def get_task_results(context):
    # Assumes module-level aliases: from pandas import DataFrame as pd_DataFrame
    # and from re import sub as re_sub.
    with open('/tmp/aprinto_celery_tail', 'r') as f:
        x = f.readlines()
    cols = ['task_status', 'task_name', 'task_id']
    df = pd_DataFrame(columns=cols)
    for it in x:
        # Only parse log lines whose fourth field is the literal word 'task'.
        fields = it.split()
        if len(fields) > 3 and fields[3] == 'task':
            # Pull status, task name, and task id out of lines of the form
            # "... task <status>: document_processing.<name>(<id>) ...".
            t = re_sub(r'(.*)(\btask\b\s)([^:]+)(: document_processing\.)([^\(]+)\(([^\)]+)\)(.*)',
                       r'\g<3>,\g<5>,\g<6>', it).replace('\n', '')
            D = dict(zip(cols, t.split(',')))
            df = df.append(D, ignore_index=True)
    context.celery_results = df
    return context
def parse_xml(pdf):
    # Assumes module-level aliases: BeautifulSoup as soup, codecs.encode as
    # codecs_encode, re.findall as re_findall, pandas.DataFrame as pd_DataFrame.
    x = soup(codecs_encode(pdf, 'utf8', 'ignore')).findAll('page')
    cols = ['page', 'font', 'top', 'left', 'width', 'height', 'text']
    g = pd_DataFrame(columns=cols)
    for idx, pg in enumerate(x, start=1):
        pg = str(pg)
        # Each <text ...>...</text> element is one positioned line on the page.
        line_iter = re_findall(r'(<text.*?</text>)', pg)
        for it in line_iter:
            # Column names: page number, then the attribute names found in the
            # <text> tag, then the text content itself.
            a = ['page'] + re_findall(r'([a-zA-Z]+)=', it) + ['text']
            text_attrs = it[5:it.find('>')].strip()
            text_contents = str(soup(it).text)
            # Column values: page number, the numeric attribute values, the text.
            b = [idx] + [int(s) for s in re_findall(r'[0-9]+', text_attrs)] + [text_contents]
            if text_contents.strip() != '':
                g = g.append(dict(zip(a, b)), ignore_index=True)
    return g
def CHARACTERISTIC_Export(self):
    # Find every block that describes a calibration parameter (CHARACTERISTIC).
    Character = re.findall(
        r'/begin *?CHARACTERISTIC([\S\s]*?)/end *?CHARACTERISTIC',
        self.content, re.M | re.I)
    CharacterLen = len(Character)
    self.CharLength = len(Character)
    # Empty dataframe that will hold all calibration-parameter information.
    DFCharacter = pd_DataFrame()
    # Pre-allocated lists, one entry per CHARACTERISTIC block.
    ListCharName = [0] * CharacterLen            # parameter name
    ListCharType = [0] * CharacterLen            # parameter type (VALUE, MAP, ...)
    ListCharAdd = [0] * CharacterLen             # ECU address
    ListChaDataType = [0] * CharacterLen         # data type / record layout
    ListCharConv = [0] * CharacterLen            # conversion method
    ListCharPNum = [1] * CharacterLen            # number of points
    ListCharMax = [0] * CharacterLen             # upper limit
    ListCharMin = [0] * CharacterLen             # lower limit
    ListXaxisType = [0] * CharacterLen           # X-axis type
    ListXaxisPNumDataType = [0] * CharacterLen   # data type of the X-axis point count (STD_AXIS)
    ListXaxisOffset = [0] * CharacterLen         # X-axis offset (FIX_AXIS)
    ListXaxisShift = [0] * CharacterLen          # X-axis shift (FIX_AXIS)
    ListXaxisPNum = [1] * CharacterLen           # number of X-axis points
    ListXaxisPRef = [0] * CharacterLen           # X-axis AXIS_PTS reference (COM_AXIS)
    ListXaxisConv = [0] * CharacterLen           # X-axis conversion method
    ListXaxisMax = [0] * CharacterLen            # X-axis upper limit
    ListXaxisMin = [0] * CharacterLen            # X-axis lower limit
    ListYaxisType = [0] * CharacterLen           # Y-axis type
    ListYaxisPNumDataType = [0] * CharacterLen   # data type of the Y-axis point count (STD_AXIS)
    ListYaxisOffset = [0] * CharacterLen         # Y-axis offset (FIX_AXIS)
    ListYaxisShift = [0] * CharacterLen          # Y-axis shift (FIX_AXIS)
    ListYaxisPNum = [1] * CharacterLen           # number of Y-axis points
    ListYaxisPRef = [0] * CharacterLen           # Y-axis AXIS_PTS reference (COM_AXIS)
    ListYaxisConv = [0] * CharacterLen           # Y-axis conversion method
    ListYaxisMax = [0] * CharacterLen            # Y-axis upper limit
    ListYaxisMin = [0] * CharacterLen            # Y-axis lower limit
    # Process one CHARACTERISTIC block at a time. Field order is:
    # Name Long-Identifier Type Address Deposit MaxDiff Conversion Lower-Limit Upper-Limit
    for i in range(CharacterLen):
        # Strip surrounding whitespace from this block.
        CharChunk = Character[i].strip()
        # Replace every quoted string with "Description" so that spaces inside
        # quotes cannot break the whitespace split below.
        CharChunk = re.sub(r"\".*?(?<!\\)\"", "Description", CharChunk)
        # Split the block on whitespace.
        TempList = re.split(r'\s+', CharChunk)
        # Name.
        ListCharName[i] = TempList[0]
        # Type.
        ListCharType[i] = TempList[2]
        # Prefix a sort letter to make sorting in Excel easier.
        if ListCharType[i] == 'VALUE':
            ListCharType[i] = 'a_VALUE'
        if ListCharType[i] == 'VAL_BLK':
            ListCharType[i] = 'b_VAL_BLK'
        if ListCharType[i] == 'ASCII':
            ListCharType[i] = 'c_ASCII'
        if ListCharType[i] == 'CURVE':
            ListCharType[i] = 'd_CURVE'
        if ListCharType[i] == 'MAP':
            ListCharType[i] = 'e_MAP'
        ListCharAdd[i] = int(TempList[3], 16)
        ListChaDataType[i] = TempList[4]
        ListCharConv[i] = TempList[6]
        ListCharMax[i] = TempList[8]
        ListCharMin[i] = TempList[7]
        # Array-like data: two ways of stating the element count have been seen
        # so far, MATRIX_DIM and NUMBER.
        if ListCharType[i] == 'b_VAL_BLK':
            # Re-split the block line by line.
            TempList = re.split(r'\n+', CharChunk)
            for j in TempList:
                # Strip surrounding whitespace from the line.
                j = j.strip()
                # Split the line on whitespace.
                TempLineList = re.split(r'\s+', j)
                if TempLineList[0] == 'MATRIX_DIM':
                    PointNumber = (int(TempLineList[1]) * int(TempLineList[2])
                                   * int(TempLineList[3]))
                    ListCharPNum[i] = PointNumber
                if TempLineList[0] == 'NUMBER':
                    ListCharPNum[i] = int(TempLineList[1])
        elif ListCharType[i] == 'c_ASCII':
            TempList = re.split(r'\n+', CharChunk)
            for j in TempList:
                j = j.strip()
                TempLineList = re.split(r'\s+', j)
                if TempLineList[0] == 'NUMBER':
                    ListCharPNum[i] = int(TempLineList[1])
        # Curve and map types.
        elif (ListCharType[i] == 'd_CURVE') or (ListCharType[i] == 'e_MAP'):
            # Extract the AXIS_DESCR blocks from this CHARACTERISTIC block.
            AxisDescrChunk = re.findall(
                r'/begin *?AXIS_DESCR([\S\s]*?)/end *?AXIS_DESCR',
                CharChunk, re.M | re.I)
            # Field order: Attribute InputQuantity Conversion MaxAxisPoints LowerLimit UpperLimit
            AxisDescrChunk[0] = AxisDescrChunk[0].strip()
            # Split the X-axis description on whitespace.
            TempList = re.split(r'\s+', AxisDescrChunk[0])
            ListXaxisType[i] = TempList[0]
            ListXaxisConv[i] = TempList[2]
            ListXaxisMax[i] = TempList[5]
            ListXaxisMin[i] = TempList[4]
            # Standard axis.
            if TempList[0] == 'STD_AXIS':
                ListXaxisPNumDataType[i] = ListChaDataType[i]
                # For now take the maximum axis point count instead of reading
                # the actual point count from memory.
                ListXaxisPNum[i] = int(TempList[3])
            # Fixed (equally spaced) axis.
            if TempList[0] == 'FIX_AXIS':
                # Re-split the X-axis description line by line.
                TempList = re.split(r'\n+', AxisDescrChunk[0])
                for j in TempList:
                    j = j.strip()
                    TempLineList = re.split(r'\s+', j)
                    if TempLineList[0] == 'FIX_AXIS_PAR':
                        ListXaxisOffset[i] = int(TempLineList[1])
                        ListXaxisShift[i] = int(TempLineList[2])
                        ListXaxisPNum[i] = int(TempLineList[3])
            if TempList[0] == 'COM_AXIS':
                # Re-split the X-axis description line by line.
                TempList = re.split(r'\n+', AxisDescrChunk[0])
                for j in TempList:
                    j = j.strip()
                    TempLineList = re.split(r'\s+', j)
                    if TempLineList[0] == 'AXIS_PTS_REF':
                        ListXaxisPRef[i] = TempLineList[1]
            # ListCharPNum[i] = ListXaxisPNum[i]
            if ListCharType[i] == 'e_MAP':
                AxisDescrChunk[1] = AxisDescrChunk[1].strip()
                # Split the Y-axis description on whitespace.
                TempList = re.split(r'\s+', AxisDescrChunk[1])
                ListYaxisType[i] = TempList[0]
                ListYaxisConv[i] = TempList[2]
                ListYaxisMax[i] = TempList[5]
                ListYaxisMin[i] = TempList[4]
                # Standard axis.
                if TempList[0] == 'STD_AXIS':
                    ListYaxisPNumDataType[i] = ListChaDataType[i]
                    # For now take the maximum axis point count instead of
                    # reading the actual point count from memory.
                    ListYaxisPNum[i] = int(TempList[3])
                # Fixed (equally spaced) axis.
                if TempList[0] == 'FIX_AXIS':
                    # Re-split the Y-axis description line by line.
                    TempList = re.split(r'\n+', AxisDescrChunk[1])
                    for j in TempList:
                        j = j.strip()
                        TempLineList = re.split(r'\s+', j)
                        if TempLineList[0] == 'FIX_AXIS_PAR':
                            ListYaxisOffset[i] = int(TempLineList[1])
                            ListYaxisShift[i] = int(TempLineList[2])
                            ListYaxisPNum[i] = int(TempLineList[3])
                if TempList[0] == 'COM_AXIS':
                    # Re-split the Y-axis description line by line.
                    TempList = re.split(r'\n+', AxisDescrChunk[1])
                    for j in TempList:
                        j = j.strip()
                        TempLineList = re.split(r'\s+', j)
                        if TempLineList[0] == 'AXIS_PTS_REF':
                            ListYaxisPRef[i] = TempLineList[1]
                # ListCharPNum[i] = ListXaxisPNum[i] * ListYaxisPNum[i]
    DFCharacter['CharName'] = ListCharName
    DFCharacter['CharType'] = ListCharType
    DFCharacter['CharAdd'] = ListCharAdd
    DFCharacter['ChaDataType'] = ListChaDataType
    DFCharacter['CharConv'] = ListCharConv
    DFCharacter['CharPNum'] = ListCharPNum
    DFCharacter['CharMin'] = ListCharMin
    DFCharacter['CharMax'] = ListCharMax
    DFCharacter['XaxisType'] = ListXaxisType
    DFCharacter['XaxisPNumDataType'] = ListXaxisPNumDataType
    DFCharacter['XaxisOffset'] = ListXaxisOffset
    DFCharacter['XaxisShift'] = ListXaxisShift
    DFCharacter['XaxisPNum'] = ListXaxisPNum
    DFCharacter['XaxisPRef'] = ListXaxisPRef
    DFCharacter['XaxisConv'] = ListXaxisConv
    DFCharacter['XaxisMin'] = ListXaxisMin
    DFCharacter['XaxisMax'] = ListXaxisMax
    DFCharacter['YaxisType'] = ListYaxisType
    DFCharacter['YaxisPNumDataType'] = ListYaxisPNumDataType
    DFCharacter['YaxisOffset'] = ListYaxisOffset
    DFCharacter['YaxisShift'] = ListYaxisShift
    DFCharacter['YaxisPNum'] = ListYaxisPNum
    DFCharacter['YaxisPRef'] = ListYaxisPRef
    DFCharacter['YaxisConv'] = ListYaxisConv
    DFCharacter['YaxisMin'] = ListYaxisMin
    DFCharacter['YaxisMax'] = ListYaxisMax
    return DFCharacter
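# A minimal usage sketch, not part of the original module: _Demo is a
# hypothetical stand-in for the class that owns CHARACTERISTIC_Export, and the
# sample block below is an assumed illustration of the /begin CHARACTERISTIC
# layout the whitespace split relies on (Name, Long-Identifier, Type, Address,
# Deposit, MaxDiff, Conversion, Lower-Limit, Upper-Limit). It assumes `re` and
# the pd_DataFrame alias are available at module level.
if __name__ == '__main__':
    sample_a2l = """
    /begin CHARACTERISTIC
        EngSpdLimit "maximum engine speed" VALUE 0x80001000 __UWORD_Z 0 CM_RPM 0 8000
    /end CHARACTERISTIC
    """

    class _Demo:
        content = sample_a2l

    # Expected: one row with CharName='EngSpdLimit', CharType='a_VALUE',
    # CharAdd=0x80001000, CharConv='CM_RPM', CharMin='0', CharMax='8000'.
    print(CHARACTERISTIC_Export(_Demo()))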
def COMPU_METHOD_Export(self):
    # Find every block that describes a conversion formula (COMPU_METHOD).
    ConvMoth = re.findall(
        r'/begin *?COMPU_METHOD([\S\s]*?)/end *?COMPU_METHOD',
        self.content, re.M | re.I)
    # Empty dataframe that will hold all conversion information.
    DFConvInfo = pd_DataFrame()
    # One list per output column.
    ListConvName = []   # conversion name
    ListUnit = []       # physical unit
    ListA = []          # coefficient A
    ListB = []          # coefficient B
    ListC = []          # coefficient C
    ListD = []          # coefficient D
    ListE = []          # coefficient E
    ListF = []          # coefficient F
    # Process one conversion block at a time.
    for i in ConvMoth:
        # Strip surrounding whitespace from this block.
        i = i.strip()
        # Replace quoted strings with "Description" so spaces inside quotes
        # cannot break the whitespace split below.
        # i = re.sub(r"\"\s+\"", "Description", i)
        # Split the block on whitespace.
        TempList = re.split(r'\s+', i)
        # Name.
        ListConvName.append(TempList[0])
        # Unit (decoded from GBK in the original Python 2 code).
        if isinstance(TempList[4], bytes):
            TempList[4] = TempList[4].decode('gbk')
        ListUnit.append(TempList[4])
        # Only RAT_FUNC conversions are handled for now.
        if TempList[2] == 'RAT_FUNC':
            # Re-split the block line by line.
            ConvInfoList = re.split(r'\n+', i)
            # Scan the lines for the one that carries the coefficients.
            for j in ConvInfoList:
                # Strip surrounding whitespace from the line.
                j = j.strip()
                # Split the line on whitespace.
                ListLine = j.split()
                # A line starting with COEFFS carries the six coefficients.
                if ListLine[0] == 'COEFFS':
                    ListA.append(float(ListLine[1]))
                    ListB.append(float(ListLine[2]))
                    ListC.append(float(ListLine[3]))
                    ListD.append(float(ListLine[4]))
                    ListE.append(float(ListLine[5]))
                    ListF.append(float(ListLine[6]))
        else:
            ListA.append(0)
            ListB.append(1)
            ListC.append(0)
            ListD.append(0)
            ListE.append(0)
            ListF.append(1)
    # Identity conversion used when a parameter has no COMPU_METHOD.
    ListConvName.append('NO_COMPU_METHOD')
    ListUnit.append('')
    ListA.append(0)
    ListB.append(1)
    ListC.append(0)
    ListD.append(0)
    ListE.append(0)
    ListF.append(1)
    # Add the name, unit, and coefficient lists to the dataframe.
    DFConvInfo['Name'] = ListConvName
    DFConvInfo['Unit'] = ListUnit
    DFConvInfo['A'] = ListA
    DFConvInfo['B'] = ListB
    DFConvInfo['C'] = ListC
    DFConvInfo['D'] = ListD
    DFConvInfo['E'] = ListE
    DFConvInfo['F'] = ListF
    return DFConvInfo
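# A small, hedged helper (not part of the original class) showing how the
# exported COEFFS are commonly interpreted for RAT_FUNC conversions:
# internal = (A*phys**2 + B*phys + C) / (D*phys**2 + E*phys + F). For the usual
# linear case (A = D = E = 0) this inverts to phys = (F*raw - C) / B, and the
# NO_COMPU_METHOD row appended above (A=0, B=1, C=0, D=0, E=0, F=1) reduces to
# the identity. The function name and the linear-only restriction are
# assumptions of this sketch, not part of the original code.
def raw_to_phys(raw, A, B, C, D, E, F):
    # Linear case only: raw = (B*phys + C) / F  =>  phys = (F*raw - C) / B.
    if A == 0 and D == 0 and E == 0 and B != 0:
        return (F * raw - C) / B
    raise NotImplementedError('non-linear RAT_FUNC conversion not handled here')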
def write2csv_tpot(X, y, outfName, feat_list):
    # Stack the feature matrix and the target column side by side, then dump to CSV.
    dat = np_hstack((X, y[:, None]))
    df = pd_DataFrame(dat)
    df.to_csv(path_or_buf=outfName, index=False, header=feat_list)
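# Hedged usage sketch for write2csv_tpot, assuming the np_hstack / pd_DataFrame
# aliases used above are bound at module level. X is an (n_samples, n_features)
# array, y the target vector, and feat_list the header row (feature names plus
# the target column name); the file name below is only illustrative.
# import numpy as np
# X = np.random.rand(5, 3)
# y = np.array([0, 1, 0, 1, 1])
# write2csv_tpot(X, y, 'train_for_tpot.csv', ['f1', 'f2', 'f3', 'target'])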
def _get_xbrl_datas(xbrl_file, xbrl_file_data):
    """Extract the data from one XBRL file."""
    # Read the xbrl file.
    if RE_XBRL_P_V1_MATCH(os_basename(xbrl_file)):
        # Old EDINET XBRL format.
        # print(xbrl_file)
        xbrl = xbrl_jpfr_Parser(xbrl_file, xbrl_file_data)
        xbrl_ver = 1
    elif RE_XBRL_P_V2_MATCH(os_basename(xbrl_file)):
        # print(xbrl_file)
        xbrl = xbrl_jpcor_Parser(xbrl_file, xbrl_file_data)
        xbrl_ver = 2
    else:
        # Audit-report XBRL falls through here (jpaud-***.xbrl).
        # print('Unsupported file name %s' % xbrl_file)
        return None
    # Convert the data to rows; the (Japanese) column labels are kept as-is
    # because downstream code selects columns by these names.
    data_labels = [
        'version', '提出日', '提出回数', '報告対象期間期末日', '追番', '第N期',
        '名前空間接頭辞', 'tag', 'id', 'context',
        '開始日', '終了日', '期末日', '連結', '値',
    ]
    context_tags = xbrl.context_tags
    xbrl_infos = [
        xbrl_ver,
        xbrl.info['提出日'],
        xbrl.info['提出回数'],
        xbrl.info['報告対象期間期末日'],
        xbrl.info['追番'],
        xbrl.info['第N期'],
    ]
    datas = []
    datas_append = datas.append
    xbrl_standard = xbrl.info['会計基準'] if '会計基準' in xbrl.info else None
    # Kinds of xbrl.xbrl_datas entries (prefixes corresponding to namespaces):
    #   management information (jpfr-di, ifrs, jpdei_cor)
    #   cover page / summary / body text (jpcrp_cor)
    #   financial statements (jpfr-t-***, ifrs, jppfs_cor)
    #   filer-specific taxonomy (*E00000*)
    for (namespace, xbrl_data) in xbrl.xbrl_datas:
        # Keys are tuples of (tag name, context, id);
        # values are dicts of attributes and text.
        for ((t_tag, t_context_ref, t_id), v) in xbrl_data.items():
            # Split the namespace off the tag name and convert it to a prefix.
            (t_ns, t_tag_name) = t_tag.rsplit('}', maxsplit=1)
            try:
                datas_append(
                    # XBRL version and document information.
                    xbrl_infos +
                    # Namespace prefix, tag name, id attribute, context.
                    [
                        xbrl.ns_prefixes[t_ns.lstrip('{')],
                        t_tag_name,
                        t_id,
                        t_context_ref,
                    ] +
                    # Start date, end date, instant date.
                    _get_dates(context_tags[t_context_ref]['period']) +
                    # Consolidated flag and the type-converted value.
                    [
                        _get_consolidated_or_nonconsolidated(
                            context_tags[t_context_ref], xbrl_ver, xbrl_standard),
                        conv_str_to_num(v['text']),
                    ])
            except:
                print(format_exc())
    del (xbrl, xbrl_infos, context_tags)
    # Convert the rows to a dataframe.
    df = pd_DataFrame(datas, columns=data_labels)
    del (datas, data_labels)

    def df_conv_str_to_datetime(t_colulmn_name):
        """Convert a string column to datetime."""
        try:
            df[t_colulmn_name] = pd_to_datetime(df[t_colulmn_name])
        except (TypeError, ValueError):
            print('Conversion error in %s; retrying with conv_str_to_num' % t_colulmn_name)
            df[t_colulmn_name] = df[t_colulmn_name].apply(conv_str_to_num)
        return

    for colulmn_name in ('提出日', '開始日', '終了日', '期末日'):
        df_conv_str_to_datetime(colulmn_name)
    return df
def esearch_disease(DISEASE_LIST, OUTDIR):
    CREATE_DIR(OUTDIR)
    DISEASE_DIC = MAKE_DICIONARY(DISEASE_LIST)
    # Data frame to store all counts.
    # +2 adds one extra row for "COUNTS" and one for "TOTAL1".
    df = pd_DataFrame(index=range(0, len(DISEASE_DIC) + 2), columns=range(0, 8))
    df.columns = ["disease", "COD", "QUERY1", "QUERY2", "QUERY3", "QUERY4",
                  "QUERY5", "TOTAL2"]
    COL1 = list(DISEASE_DIC)
    COL1.append('COUNTS')
    COL1.append('TOTAL1')
    df['disease'] = COL1
    # Data frame to store the command used for each search.
    COMMAND = pd_DataFrame(index=range(0, len(DISEASE_DIC)), columns=range(0, 8))
    COMMAND.columns = ["disease", "COD", "QUERY1", "QUERY2", "QUERY3", "QUERY4",
                       "QUERY5", "END"]
    COMMAND["disease"] = COL1[0:len(DISEASE_DIC)]
    COMMAND["END"] = '.'
    # Data frame to store the queries' descriptions.
    QUERY_description = pd_DataFrame(index=range(0, 5), columns=range(0, 1))
    QUERY_description.columns = ["DESCRIPTION"]
    QUERY_description.index = ["QUERY1", "QUERY2", "QUERY3", "QUERY4", "QUERY5"]
    QUERY1_desc = ('Searches the disease name in all fields and filters for '
                   'gene-expression experiments performed on human samples. '
                   'This is the broadest query.')
    QUERY2_desc = ('Same as QUERY1, but also searches for "patient" OR '
                   '"patients" in all fields.')
    QUERY3_desc = ('Same as QUERY2, but also filters for bioprojects present '
                   'in the SRA database.')
    QUERY4_desc = ('Searches the disease name in the bioproject title only, '
                   'searches for "patient" OR "patients" in all fields, and '
                   'filters for gene-expression experiments performed on '
                   'human samples.')
    QUERY5_desc = ('Same as QUERY4, but also filters for bioprojects present '
                   'in the SRA database.')
    QUERY_description["DESCRIPTION"] = [QUERY1_desc, QUERY2_desc, QUERY3_desc,
                                        QUERY4_desc, QUERY5_desc]
    IdList_QUERY1 = []
    IdList_QUERY2 = []
    IdList_QUERY3 = []
    IdList_QUERY4 = []
    IdList_QUERY5 = []
    IdList_total = []
    N = 0
    for DISEASE in list(DISEASE_DIC):
        print(str(N) + '\t' + DISEASE)
        COD = DISEASE_DIC[DISEASE]
        df["COD"][N] = COD
        COMMAND["COD"][N] = COD
        QUERY_DIC = {
            '1': '("' + DISEASE + '"[All Fields])AND'
                 '("transcriptome gene expression"[Filter]AND"org human"[Filter])',
            '2': '("' + DISEASE + '"[All Fields]AND'
                 '("patient"[All Fields]OR"patients"[All Fields])AND'
                 '("transcriptome gene expression"[Filter]AND"org human"[Filter])',
            '3': '("' + DISEASE + '"[All Fields]AND'
                 '("patient"[All Fields]OR"patients"[All Fields])AND'
                 '("transcriptome gene expression"[Filter]AND"org human"[Filter]'
                 'AND"bioproject sra"[Filter])',
            '4': '("' + DISEASE + '"[Title]AND'
                 '("patient"[All Fields]OR"patients"[All Fields])AND'
                 '("transcriptome gene expression"[Filter]AND"org human"[Filter])',
            '5': '("' + DISEASE + '"[Title]AND'
                 '("patient"[All Fields]OR"patients"[All Fields])AND'
                 '("transcriptome gene expression"[Filter]AND"org human"[Filter])'
                 'AND"bioproject sra"[Filter])',
        }
        Idlist_disease = []
        ROUND = ['1', '2', '3', '4', '5']
        for R in ROUND:
            QUERY = 'QUERY' + R
            TERM = QUERY_DIC[R]
            # COMMAND[locals[QUERY]][N]=TERM
            handle = Entrez.esearch(db="bioproject", retmax=1000, term=TERM)
            record = Entrez.read(handle)
            handle.close()
            if int(record["Count"]) > 1000:
                print('\nATTENTION!\n' + record["Count"] + ' bioprojects are '
                      'related to this esearch and only 1000 will be written '
                      'to the Idlist for the further analysis.\n\n' + QUERY +
                      ' for ' + DISEASE + '\n\n' + QUERY_DIC[R] + '\n')
            # Build the per-disease list and the overall ID list.
            Idlist_disease += list(record["IdList"])
            IdList_total += list(record["IdList"])
            # Add the IDs, the command, and the count to the per-query tables.
            if R == '1':
                IdList_QUERY1 += list(record["IdList"])
                COMMAND['QUERY1'][N] = TERM
                df['QUERY1'][N] = int(record["Count"])
            elif R == '2':
                IdList_QUERY2 += list(record["IdList"])
                COMMAND['QUERY2'][N] = TERM
                df['QUERY2'][N] = int(record["Count"])
            elif R == '3':
                IdList_QUERY3 += list(record["IdList"])
                COMMAND['QUERY3'][N] = TERM
                df['QUERY3'][N] = int(record["Count"])
            elif R == '4':
                IdList_QUERY4 += list(record["IdList"])
                COMMAND['QUERY4'][N] = TERM
                df['QUERY4'][N] = int(record["Count"])
            elif R == '5':
                IdList_QUERY5 += list(record["IdList"])
                COMMAND['QUERY5'][N] = TERM
                df['QUERY5'][N] = int(record["Count"])
        # Remove duplicates from the per-disease list.
        Idlist_disease = list(set(Idlist_disease))
        df['TOTAL2'][N] = len(Idlist_disease)
        outfile = pathjoin(OUTDIR, "IdListDIR/disease", COD + ".txt")
        with open(outfile, 'w') as f:
            print("\n".join(Idlist_disease), file=f)
        N += 1
    # Fill the totals row (from the third column to the last).
    for COL in list(df)[2:]:
        df[COL][len(DISEASE_DIC)] = df[COL][0:len(DISEASE_DIC)].sum(axis=0)
    # Write the remaining ID lists to text files.
    IdList_total = list(set(IdList_total))
    outfile = pathjoin(OUTDIR, "IdListDIR/IdList_total.txt")
    with open(outfile, 'w') as f:
        print("\n".join(IdList_total), file=f)
    IdList_QUERY1 = list(set(IdList_QUERY1))
    df.loc[len(DISEASE_DIC) + 1, "QUERY1"] = len(IdList_QUERY1)
    outfile = pathjoin(OUTDIR, "IdListDIR/query", "IdList_QUERY1.txt")
    with open(outfile, 'w') as f:
        print("\n".join(IdList_QUERY1), file=f)
    IdList_QUERY2 = list(set(IdList_QUERY2))
    df.loc[len(DISEASE_DIC) + 1, "QUERY2"] = len(IdList_QUERY2)
    outfile = pathjoin(OUTDIR, "IdListDIR/query", "IdList_QUERY2.txt")
    with open(outfile, 'w') as f:
        print("\n".join(IdList_QUERY2), file=f)
    IdList_QUERY3 = list(set(IdList_QUERY3))
    df.loc[len(DISEASE_DIC) + 1, "QUERY3"] = len(IdList_QUERY3)
    outfile = pathjoin(OUTDIR, "IdListDIR/query", "IdList_QUERY3.txt")
    with open(outfile, 'w') as f:
        print("\n".join(IdList_QUERY3), file=f)
    IdList_QUERY4 = list(set(IdList_QUERY4))
    df.loc[len(DISEASE_DIC) + 1, "QUERY4"] = len(IdList_QUERY4)
    outfile = pathjoin(OUTDIR, "IdListDIR/query", "IdList_QUERY4.txt")
    with open(outfile, 'w') as f:
        print("\n".join(IdList_QUERY4), file=f)
    IdList_QUERY5 = list(set(IdList_QUERY5))
    df.loc[len(DISEASE_DIC) + 1, "QUERY5"] = len(IdList_QUERY5)
    outfile = pathjoin(OUTDIR, "IdListDIR/query", "IdList_QUERY5.txt")
    with open(outfile, 'w') as f:
        print("\n".join(IdList_QUERY5), file=f)
    # Write all results to a single Excel file.
    writer = pd_ExcelWriter(pathjoin(OUTDIR, 'search_NCBI_RESULT.xlsx'),
                            engine='xlsxwriter')
    df.to_excel(writer, sheet_name='counts')
    COMMAND.to_excel(writer, sheet_name='command_lines')
    QUERY_description.to_excel(writer, sheet_name='query_description')
    writer.save()
    return (pathjoin(osgetcwd(), OUTDIR))
def efetch_found_bioprojects(OUTDIR):
    def printProgressBar(iteration, total, prefix='', suffix='',
                         decimals=1, length=100, fill='█'):
        """
        Call in a loop to create a terminal progress bar.
        @params:
            iteration - Required : current iteration (Int)
            total     - Required : total iterations (Int)
            prefix    - Optional : prefix string (Str)
            suffix    - Optional : suffix string (Str)
            decimals  - Optional : positive number of decimals in percent complete (Int)
            length    - Optional : character length of bar (Int)
            fill      - Optional : bar fill character (Str)
        """
        percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
        filledLength = int(length * iteration // total)
        bar = fill * filledLength + '-' * (length - filledLength)
        print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end='\r')
        # Print a new line on completion.
        if iteration == total:
            print()

    # Collect information about the bioprojects found earlier.
    if pathexists(OUTDIR):
        for DIR in ['Bioprojects', 'Bioprojects/xml']:
            if not pathexists(pathjoin(OUTDIR, DIR)):
                osmkdir(pathjoin(OUTDIR, DIR))
        path_to_list = pathjoin(OUTDIR, 'IdListDIR/IdList_total.txt')
        if isfile(path_to_list):
            with open(path_to_list, 'r') as f:
                IdList_total = list(filter(None, f.read().splitlines()))
        else:
            print('File ' + path_to_list + ' was not found. Run '
                  'esearch_disease(OUTDIR) for making it.')
            exit()
    else:
        print('Directory ' + pathjoin(OUTDIR) + ' is not accessible. Did you run '
              'esearch_disease() previously? If not, do it and try again.')
        exit()
    df2 = pd_DataFrame(index=range(0, len(IdList_total)), columns=range(0, 7))
    df2.columns = ["ID", "accession", "GEO", "title", "abstract", "disease", "COD"]
    df2["ID"] = IdList_total
    # Blank lines to keep the progress bar printed inside the loop below separate.
    print("\n\n")
    # Prepare the progress bar.
    l = len(IdList_total)
    i = 0
    printProgressBar(0, l, prefix='Download:', suffix='Complete', length=50)
    RECALL = []  # if a download fails, the ID is stored in RECALL
    DIC_ID = {}
    for ID in IdList_total:
        try:
            handle = Entrez.efetch(db="bioproject", id=ID)
        except:
            RECALL += [ID]
            print('handle = Entrez.efetch(db="bioproject", id=' + ID + ')\tFAILED')
            continue
        # Avoid a catastrophic event in case NCBI fails to give the
        # information for one ID.
        try:
            record = handle.read()
            root = ET.fromstring(record)
            DIC = root.find(".//ProjectID/ArchiveID").attrib
            DIC_ID[DIC['accession']] = DIC_ID.get(DIC['accession'], DIC['id'])
            outfile = pathjoin(OUTDIR, 'Bioprojects/xml',
                               DIC['accession'] + '_' + DIC['id'] + '.xml')
            # print(outfile)
            with open(outfile, "w", encoding="utf-8") as f:
                print(record, file=f)
        except:
            RECALL += [ID]
            print('FAILED to process ' + ID + ' during the first trial')
            continue
        printProgressBar(i + 1, l, prefix='Download:', suffix='Complete', length=50)
        i += 1
    # RECALL: retry the IDs that failed.
    if len(RECALL) > 0:
        print("\n\nFailure to download IDs. STARTING RECALL.")
        l = len(RECALL)
        i = 0
        printProgressBar(0, l, prefix='Download:', suffix='Complete', length=50)
        RECALL2 = []
        for ID in RECALL:
            try:
                handle = Entrez.efetch(db="bioproject", id=ID)
            except:
                RECALL2 += [ID]
                print('handle = Entrez.efetch(db="bioproject", id=' + ID + ')'
                      '\tFAILED in RECALL')
                continue
            try:
                record = handle.read()
                root = ET.fromstring(record)
                DIC = root.find(".//ProjectID/ArchiveID").attrib
                DIC_ID[DIC['accession']] = DIC_ID.get(DIC['accession'], DIC['id'])
                outfile = pathjoin(OUTDIR, 'Bioprojects/xml',
                                   DIC['accession'] + '_' + DIC['id'] + '.xml')
                # print(outfile)
                with open(outfile, "w", encoding="utf-8") as f:
                    print(record, file=f)
            except:
                RECALL2 += [ID]
                print('FAILED to process ' + ID + ' during the RECALL')
                continue
            printProgressBar(i + 1, l, prefix='RECALL:', suffix='Complete', length=50)
            i += 1
        if len(RECALL2) > 0:
            outfile = pathjoin(OUTDIR, 'Bioprojects/', 'RECALL_failure.txt')
            open(outfile, 'w').write(str(RECALL2))
            print("It was not possible to get ID even during the RECALL\nYou "
                  "can find the problematic IDs on file:\n" + outfile)
    # Save the accession -> id mapping collected during the downloads.
    outfile = pathjoin(OUTDIR, 'Bioprojects/', 'dict_ID_ACC.txt')
    open(outfile, 'w').write(str(DIC_ID))
def eventstudy(self, data=None, model='m', estwin=100, gap=50, evtwins=-10,
               evtwine=10, minval=70, output='df'):
    """
    Parameters passed to the event study method.

    data    = event data (event date & permno combinations)
    model   = madj (market-adjusted model)
              m    (market model)
              ff   (Fama-French)
              ffm  (Fama-French with momentum factor)
    estwin  = estimation window
    gap     = gap between estimation window and event window
    evtwins = days preceding event date to begin event window
    evtwine = days after event date to close the event window
    minval  = minimum number of non-missing return observations (per event)
              to be regressed on
    output  = output format of the event study results
              xls   (output an excel file to output path)
              csv   (output a csv file to output path)
              json  (output a json file to output path)
              df    (returns a dictionary of pandas dataframes)
              print (outputs results to the console - not available via qsub)
    """
    ##################################################################################
    # STEP 1 - SET ESTIMATION, EVENT, AND GAP WINDOWS AND GRAB DATA FROM EVENTS FILE #
    ##################################################################################
    estwins = (estwin + gap + np_abs(evtwins))  # estimation window start
    estwine = (gap + np_abs(evtwins) + 1)       # estimation window end
    evtwinx = (estwins + 1)    # evt time value (0=event date, -10=window start, 10=window end)
    evtwins = np_abs(evtwins)  # convert the negative to positive (it is passed to a lag function)
    evtrang = (evtwins + evtwine + 1)  # total event window days (lag + lead + the day itself)
    """
    With the event date as a fixed point, calculate the number of days needed to
    pass to the SQL lag and lead functions to identify the estimation window,
    gap, and event window.

    evtwins: event date minus number of preceding days
             ("event date" - "number of days before event to start [evtwins parameter]")
    evtwine: event date plus number of following days
             ("event date" + "number of days after event to end [evtwine parameter]")
    gap:     number of days between the end of the "estimation window" and the
             beginning of the "event window"
    estwins: start date of the estimation window
             ("event date" - "number of days before event to start [evtwins parameter]"
              - "number of days in gap [gap parameter]"
              - "number of days in estimation window [estwin parameter]")
    evtrang: entire time range of the event study from estimation start, through
             the gap, until the event window end (evtwins + evtwine + 1)
    """
    # Default the event data in case it was not passed.
    if data is None:
        data = [{"edate": "05/29/2012", "permno": "10002"}]
    evtdata = json_dumps(data)

    # Init values wrapped up to be passed to the SQL statement.
    params = {
        'estwins': estwins,
        'estwine': estwine,
        'evtwins': evtwins,
        'evtwine': evtwine,
        'evtwinx': evtwinx,
        'evtdata': evtdata
    }

    ############################################
    # STEP 2 - GET RETURNS DATA FROM POSTGRES  #
    ############################################
    # Create a database connection.
    wconn = self.connect()

    # Get the initial data from the database and put it in a pandas dataframe.
    df = wconn.raw_sql("""
        SELECT a.*, x.*, c.date as rdate, c.ret as ret1,
               (f.mktrf+f.rf) as mkt, f.mktrf, f.rf, f.smb, f.hml, f.umd,
               (1+c.ret)*(coalesce(d.dlret,0.00)+1)-1-(f.mktrf+f.rf) as exret,
               (1+c.ret)*(coalesce(d.dlret,0.00)+1)-1 as ret,
               case when c.date between a.estwin1 and a.estwin2 then 1 else 0 end as isest,
               case when c.date between a.evtwin1 and a.evtwin2 then 1 else 0 end as isevt,
               case when c.date between a.evtwin1 and a.evtwin2
                    then (rank() OVER (PARTITION BY x.evtid ORDER BY c.date)-%(evtwinx)s)
                    else (rank() OVER (PARTITION BY x.evtid ORDER BY c.date))
               end as evttime
        FROM (
              SELECT date,
                     lag(date,  %(estwins)s ) over (order by date) as estwin1,
                     lag(date,  %(estwine)s ) over (order by date) as estwin2,
                     lag(date,  %(evtwins)s ) over (order by date) as evtwin1,
                     lead(date, %(evtwine)s ) over (order by date) as evtwin2
              FROM crsp_a_stock.dsi
             ) as a
        JOIN (select to_char(x.edate, 'ddMONYYYY') || trim(to_char(x.permno,'999999999')) as evtid,
                     x.permno, x.edate
              from json_to_recordset('%(evtdata)s') as x(edate date, permno int)
             ) as x
          ON a.date=x.edate
        JOIN crsp_a_stock.dsf c
          ON x.permno=c.permno AND c.date BETWEEN a.estwin1 and a.evtwin2
        JOIN ff_all.factors_daily f
          ON c.date=f.date
        LEFT JOIN crsp_a_stock.dsedelist d
          ON x.permno=d.permno AND c.date=d.dlstdt
        WHERE f.mktrf is not null
          AND c.ret is not null
        ORDER BY x.evtid, x.permno, a.date, c.date
        """ % params)

    # Columns coming from the database query.
    df.columns = [
        'date', 'estwin1', 'estwin2', 'evtwin1', 'evtwin2', 'evtid', 'permno',
        'edate', 'rdate', 'ret1', 'mkt', 'mktrf', 'rf', 'smb', 'hml', 'umd',
        'exret', 'ret', 'isest', 'isevt', 'evttime'
    ]

    # Additional columns that will hold computed values (post-query).
    addcols = [
        'RMSE', 'INTERCEPT', 'var_estp', 'expret', 'abret', 'alpha', '_nobs',
        '_p_', '_edf_', 'rsq', 'cret', 'cexpret', 'car', 'scar', 'sar',
        'pat_scale', 'bhar', 'lastevtwin', 'cret_edate', 'scar_edate',
        'car_edate', 'bhar_edate', 'pat_scale_edate', 'xyz'
    ]
    # Add them to the dataframe.
    for c in addcols:
        if c == 'lastevtwin':
            df[c] = 0
        else:
            df[c] = np_nan

    def cumulate_event_window(evt_mask, _nobs):
        # Cumulative measures over the event window (cret, cexpret, car, sar,
        # scar, bhar and their event-date values); this block is shared by all
        # risk models below.
        nloc = {'const': 0}
        def f_cret(row):
            # Compound the cumulative return: (1+r)(1+c)-1 = r*c + r + c.
            tmp = (row['ret'] * nloc['const']) + (row['ret'] + nloc['const'])
            nloc['const'] = tmp
            return tmp
        df.loc[evt_mask, 'cret'] = df[evt_mask].apply(f_cret, axis=1)
        df.loc[evt_mask, 'cret_edate'] = nloc['const']

        nloc = {'const': 0}
        def f_cexpret(row):
            tmp = (row['expret'] * nloc['const']) + (row['expret'] + nloc['const'])
            nloc['const'] = tmp
            return tmp
        df.loc[evt_mask, 'cexpret'] = df[evt_mask].apply(f_cexpret, axis=1)

        nloc = {'const': 0}
        def f_car(row):
            tmp = row['abret'] + nloc['const']
            nloc['const'] = tmp
            return tmp
        df.loc[evt_mask, 'car'] = df[evt_mask].apply(f_car, axis=1)
        df.loc[evt_mask, 'car_edate'] = nloc['const']

        nloc = {'const': 0}
        def f_sar(row):
            tmp = row['abret'] / np_sqrt(row['var_estp'])
            nloc['const'] = tmp
            return tmp
        df.loc[evt_mask, 'sar'] = df[evt_mask].apply(f_sar, axis=1)
        df.loc[evt_mask, 'sar_edate'] = nloc['const']

        nloc = {'const': 0}
        def f_scar(row):
            tmp = row['car'] / np_sqrt(evtrang * row['var_estp'])
            nloc['const'] = tmp
            return tmp
        df.loc[evt_mask, 'scar'] = df[evt_mask].apply(f_scar, axis=1)
        df.loc[evt_mask, 'scar_edate'] = nloc['const']

        nloc = {'const': 0}
        def f_bhar(row):
            tmp = row['cret'] - row['cexpret']
            nloc['const'] = tmp
            return tmp
        df.loc[evt_mask, 'bhar'] = df[evt_mask].apply(f_bhar, axis=1)
        df.loc[evt_mask, 'bhar_edate'] = nloc['const']

        df.loc[evt_mask, 'pat_scale'] = (_nobs - 2.00) / (_nobs - 4.00)
        df.loc[evt_mask, 'pat_scale_edate'] = (_nobs - 2.00) / (_nobs - 4.00)

    ##################################################################################
    # STEP 3 - FOR EACH EVENT, CALCULATE ABNORMAL RETURN BASED ON CHOSEN RISK MODEL  #
    ##################################################################################
    # Loop over every event.
    for evt in data:
        permno = evt['permno']
        xdate = evt['edate']
        edate = datetime.strptime(xdate, "%m/%d/%Y").date()
        est_mask = (df['permno'] == permno) & (df['edate'] == edate) & (df['isest'] == 1)
        evt_mask = (df['permno'] == permno) & (df['edate'] == edate) & (df['isevt'] == 1)

        # Check whether the event meets the minimum observations for the estimation window.
        _nobs = df["ret"][est_mask].count()

        # Only carry out the analysis if the number of observations meets the minimum threshold.
        if _nobs >= minval:
            # Regression based on the chosen model.

            # Market-Adjusted Model
            if model == 'madj':
                # Set y to the estimation window records.
                y = df["exret"][est_mask]
                # Mean and standard deviation of returns for the estimation period.
                mean = np_mean(y)
                stdv = np_std(y, ddof=1)
                # Update the columns in the original dataframe (reusing the
                # names from the SAS code to help with continuity).
                df.loc[evt_mask, 'INTERCEPT'] = mean
                df.loc[evt_mask, 'RMSE'] = stdv
                df.loc[evt_mask, '_nobs'] = len(y)
                df.loc[evt_mask, 'var_estp'] = stdv**2
                df.loc[evt_mask, 'alpha'] = mean
                df.loc[evt_mask, 'rsq'] = 0
                df.loc[evt_mask, '_p_'] = 1
                df.loc[evt_mask, '_edf_'] = (len(y) - 1)
                df.loc[evt_mask, 'expret'] = df.loc[evt_mask, 'mkt']
                df.loc[evt_mask, 'abret'] = df.loc[evt_mask, 'exret']
                df_est = df[est_mask]
                _nobs = len(df_est[df_est.ret.notnull()])
                cumulate_event_window(evt_mask, _nobs)

            # Market Model
            elif model == 'm':
                # Set y to the estimation window records.
                X = df["mktrf"][est_mask]
                y = df["ret"][est_mask]
                # Fit an OLS model with intercept on mktrf.
                X = sm_add_constant(X)
                est = sm_OLS(y, X).fit()
                # Set the variables from the output.
                df_est = df[(df['permno'] == permno) & (df['edate'] == edate) & (df['isest'] == 1)]
                _nobs = len(df_est[df_est.ret.notnull()])  # non-null observations
                alpha = est.params['const']
                beta1 = est.params['mktrf']
                df.loc[evt_mask, 'INTERCEPT'] = alpha
                df.loc[evt_mask, 'alpha'] = alpha
                df.loc[evt_mask, 'RMSE'] = np_sqrt(est.mse_resid)
                df.loc[evt_mask, '_nobs'] = _nobs
                df.loc[evt_mask, 'var_estp'] = est.mse_resid
                df.loc[evt_mask, 'rsq'] = est.rsquared
                df.loc[evt_mask, '_p_'] = 2
                df.loc[evt_mask, '_edf_'] = (len(y) - 2)

                def f_expret(row, a=alpha, b1=beta1):
                    return a + b1 * row['mktrf']
                df.loc[evt_mask, 'expret'] = df[evt_mask].apply(f_expret, axis=1)

                def f_abret(row, a=alpha, b1=beta1):
                    return row['ret'] - (a + b1 * row['mktrf'])
                df.loc[evt_mask, 'abret'] = df[evt_mask].apply(f_abret, axis=1)

                cumulate_event_window(evt_mask, _nobs)

            # Fama-French Three Factor Model
            elif model == 'ff':
                # Set y to the estimation window records.
                df_est = df[(df['permno'] == permno) & (df['edate'] == edate) & (df['isest'] == 1)]
                X = df_est[['smb', 'hml', 'mktrf']]
                y = df_est['ret']
                # Fit an OLS model with intercept on mktrf, smb, hml.
                X = sm_add_constant(X)
                est = sm_OLS(y, X).fit()
                # est = smf.ols(formula='ret ~ smb + hml + mktrf', data=df_est).fit()
                alpha = est.params['const']
                beta1 = est.params['mktrf']
                beta2 = est.params['smb']
                beta3 = est.params['hml']
                df.loc[evt_mask, 'INTERCEPT'] = alpha
                df.loc[evt_mask, 'alpha'] = alpha
                df.loc[evt_mask, 'RMSE'] = np_sqrt(est.mse_resid)
                df.loc[evt_mask, '_nobs'] = _nobs
                df.loc[evt_mask, 'var_estp'] = est.mse_resid
                df.loc[evt_mask, 'rsq'] = est.rsquared
                df.loc[evt_mask, '_p_'] = 2
                df.loc[evt_mask, '_edf_'] = (len(y) - 2)

                def f_expret(row, a=alpha, b1=beta1, b2=beta2, b3=beta3):
                    return a + b1 * row['mktrf'] + b2 * row['smb'] + b3 * row['hml']
                df.loc[evt_mask, 'expret'] = df[evt_mask].apply(f_expret, axis=1)

                def f_abret(row, a=alpha, b1=beta1, b2=beta2, b3=beta3):
                    return row['ret'] - (a + b1 * row['mktrf'] + b2 * row['smb'] + b3 * row['hml'])
                df.loc[evt_mask, 'abret'] = df[evt_mask].apply(f_abret, axis=1)

                cumulate_event_window(evt_mask, _nobs)

            # Fama-French Plus Momentum
            elif model == 'ffm':
                # Set y to the estimation window records.
                df_est = df[(df['permno'] == permno) & (df['edate'] == edate) & (df['isest'] == 1)]
                X = df_est[['mktrf', 'smb', 'hml', 'umd']]  # explanatory variables
                y = df_est['ret']                            # response variable
                # Fit an OLS model with intercept on mktrf, smb, hml, and umd.
                X = sm_add_constant(X)
                est = sm_OLS(y, X).fit()
                alpha = est.params['const']
                beta1 = est.params['mktrf']
                beta2 = est.params['smb']
                beta3 = est.params['hml']
                beta4 = est.params['umd']
                df.loc[evt_mask, 'INTERCEPT'] = alpha
                df.loc[evt_mask, 'alpha'] = alpha
                df.loc[evt_mask, 'RMSE'] = np_sqrt(est.mse_resid)
                df.loc[evt_mask, '_nobs'] = _nobs
                df.loc[evt_mask, 'var_estp'] = est.mse_resid
                df.loc[evt_mask, 'rsq'] = est.rsquared
                df.loc[evt_mask, '_p_'] = 2
                df.loc[evt_mask, '_edf_'] = (len(y) - 2)

                def f_expret(row, a=alpha, b1=beta1, b2=beta2, b3=beta3, b4=beta4):
                    return (a + b1 * row['mktrf'] + b2 * row['smb']
                            + b3 * row['hml'] + b4 * row['umd'])
                df.loc[evt_mask, 'expret'] = df[evt_mask].apply(f_expret, axis=1)

                def f_abret(row, a=alpha, b1=beta1, b2=beta2, b3=beta3, b4=beta4):
                    return row['ret'] - (a + b1 * row['mktrf'] + b2 * row['smb']
                                         + b3 * row['hml'] + b4 * row['umd'])
                df.loc[evt_mask, 'abret'] = df[evt_mask].apply(f_abret, axis=1)

                cumulate_event_window(evt_mask, _nobs)

            # Something erroneous was passed.
            else:
                df.loc[evt_mask, 'isest'] = -2

    ################################
    # STEP 4 - OUTPUT THE RESULTS  #
    ################################
    df_sta = df[df['isevt'] == 1]
    levt = df_sta['evttime'].unique()
    columns = [
        'evttime', 'car_m', 'ret_m', 'abret_m', 'abret_t', 'sar_t', 'pat_ar',
        'cret_edate_m', 'car_edate_m', 'pat_car_edate_m', 'car_edate_t',
        'scar_edate_t', 'bhar_edate_m'
    ]
    idxlist = list(levt)
    df_stats = pd_DataFrame(index=idxlist, columns=columns)
    df_stats = df_stats.fillna(0.00000000)  # with 0s rather than NaNs

    # Event
    df_stats['evttime'] = df_sta.groupby(['evttime'])['evttime'].unique()
    # Means
    df_stats['abret_m'] = df_sta.groupby(['evttime'])['abret'].mean()
    df_stats['bhar_edate_m'] = df_sta.groupby(['evttime'])['bhar_edate'].mean()
    df_stats['car_edate_m'] = df_sta.groupby(['evttime'])['car_edate'].mean()
    df_stats['car_m'] = df_sta.groupby(['evttime'])['car'].mean()
    df_stats['cret_edate_m'] = df_sta.groupby(['evttime'])['cret_edate'].mean()
    df_stats['pat_scale_m'] = df_sta.groupby(['evttime'])['pat_scale'].mean()
    df_stats['pat_car_edate_mean'] = 0
    df_stats['ret_m'] = df_sta.groupby(['evttime'])['ret'].mean()
    df_stats['sar_m'] = df_sta.groupby(['evttime'])['sar'].mean()
    df_stats['scar_edate_m'] = df_sta.groupby(['evttime'])['scar_edate'].mean()
    df_stats['scar_m'] = df_sta.groupby(['evttime'])['scar'].mean()
    # Standard deviations
    df_stats['car_v'] = df_sta.groupby(['evttime'])['car'].std()
    df_stats['abret_v'] = df_sta.groupby(['evttime'])['abret'].std()
    df_stats['sar_v'] = df_sta.groupby(['evttime'])['sar'].std()
    df_stats['pat_scale_v'] = df_sta.groupby(['evttime'])['pat_scale'].std()
    df_stats['car_edate_v'] = df_sta.groupby(['evttime'])['car_edate'].std()
    df_stats['scar_edate_v'] = df_sta.groupby(['evttime'])['scar_edate'].std()
    df_stats['scar_v'] = df_sta.groupby(['evttime'])['scar'].std()
    # Counts
    df_stats['scar_n'] = df_sta.groupby(['evttime'])['scar'].count()
    df_stats['scar_edate_n'] = df_sta.groupby(['evttime'])['scar_edate'].count()
    df_stats['sar_n'] = df_sta.groupby(['evttime'])['sar'].count()
    df_stats['car_n'] = df_sta.groupby(['evttime'])['car'].count()
    df_stats['n'] = df_sta.groupby(['evttime'])['evttime'].count()
    # Sums
    df_stats['pat_scale_edate_s'] = df_sta.groupby(['evttime'])['pat_scale_edate'].sum()
    df_stats['pat_scale_s'] = df_sta.groupby(['evttime'])['pat_scale'].sum()

    # T statistics 1
    def tstat(row, m, v, n):
        return row[m] / (row[v] / np_sqrt(row[n]))

    df_stats['abret_t'] = df_stats.apply(tstat, axis=1, args=('abret_m', 'abret_v', 'n'))
    df_stats['sar_t'] = df_stats.apply(tstat, axis=1, args=('sar_m', 'sar_v', 'n'))
    df_stats['car_edate_t'] = df_stats.apply(tstat, axis=1,
                                             args=('car_edate_m', 'car_edate_v', 'n'))
    df_stats['scar_edate_t'] = df_stats.apply(tstat, axis=1,
                                              args=('scar_edate_m', 'scar_edate_v', 'scar_edate_n'))

    # T statistics 2
    def tstat2(row, m, s, n):
        return row[m] / (np_sqrt(row[s]) / row[n])

    df_stats['pat_car'] = df_stats.apply(tstat2, axis=1, args=('scar_m', 'pat_scale_s', 'scar_n'))
    df_stats['pat_car_edate_m'] = df_stats.apply(tstat2, axis=1,
                                                 args=('scar_edate_m', 'pat_scale_edate_s', 'scar_edate_n'))
    df_stats['pat_ar'] = df_stats.apply(tstat2, axis=1, args=('sar_m', 'pat_scale_s', 'sar_n'))

    # FILE 2 - EVENT WINDOW
    df_evtw = df.loc[(df['isevt'] == 1),
                     ['permno', 'edate', 'rdate', 'evttime', 'ret', 'abret']]
    df_evtw = df_evtw.sort_values(['permno', 'evttime'], ascending=[True, True])

    # FILE 1 - EVENT DATE
    maxv = max(levt)
    df_evtd = df.loc[(df['isevt'] == 1) & (df['evttime'] == maxv),
                     ['permno', 'edate', 'cret', 'car', 'bhar']]
    df_evtd = df_evtd.sort_values(['permno', 'edate'], ascending=[True, True])

    if output == 'df':
        retval = {}
        retval['event_stats'] = df_stats
        retval['event_window'] = df_evtw
        retval['event_date'] = df_evtd
        return retval
    elif output == 'print':
        retval = {}
        print(tabulate(df_evtd.sort_values(['permno', 'edate'], ascending=[True, True]),
                       headers='keys', tablefmt='psql'))
        print(tabulate(df_evtw, headers='keys', tablefmt='psql'))
        print(tabulate(df_stats, headers='keys', tablefmt='psql'))
        return retval
    elif output == 'json':
        retval = {}
        retval['event_stats'] = df_stats.to_dict(orient='split')
        retval['event_window'] = df_evtw.to_dict(orient='split')
        retval['event_date'] = df_evtd.to_dict(orient='split')
        # Write this to a file.
        with open(os.path.join(self.output_path, 'EventStudy.json'), 'w') as outfile:
            json_dump(retval, outfile, cls=EncoderJson)
        # Return the output in case they are doing something programmatically.
        return json_dumps(retval, cls=EncoderJson)
    elif output == 'csv':
        retval = ''
        es = StringIO_StringIO()
        df_stats.to_csv(es)
        retval += es.getvalue()
        ew = StringIO_StringIO()
        df_evtw.to_csv(ew)
        retval += "\r"
        retval += ew.getvalue()
        ed = StringIO_StringIO()
        df_evtd.to_csv(ed)
        retval += ed.getvalue()
        # Write this to a file.
        with open(os.path.join(self.output_path, 'EventStudy.csv'), 'w') as outfile:
            outfile.write(retval)
        # Return the output in case they are doing something programmatically.
        return retval
    elif output == 'xls':
        retval = {}
        xlswriter = pd_ExcelWriter(os.path.join(self.output_path, 'EventStudy.xls'))
        df_stats.to_excel(xlswriter, 'Stats')
        df_evtw.to_excel(xlswriter, 'Event Window')
        df_evtd.to_excel(xlswriter, 'Event Date')
        xlswriter.save()
        return retval
    else:
        pass
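# Hedged usage sketch for eventstudy, following the event-data format of the
# default value inside the method ({"edate": "MM/DD/YYYY", "permno": ...}).
# The surrounding class (assumed here to expose connect() and output_path, and
# instantiated as `es`) is not shown in this excerpt, so the call below is
# illustrative only.
# events = [
#     {"edate": "05/29/2012", "permno": "10002"},
#     {"edate": "03/15/2013", "permno": "11308"},
# ]
# res = es.eventstudy(data=events, model='m', estwin=100, gap=50,
#                     evtwins=-10, evtwine=10, minval=70, output='df')
# res['event_stats'].head()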
def analyze_AG_bipartite_network(genes, authors_GB_genes, pub_thresh=1,
                                 save_file_name="author_gene_bp.json",
                                 plot_flag=False):
    gene_list = genes.split(',')
    t0 = time.time()
    # Unpickle groupby object.
    # authors_GB_genes = pd.read_pickle(author_gene_GB_fname)
    authors_GB_genes = app.authors_GB_genes_loaded

    # Get rid of invalid genes in gene_list.
    new_gene_list = []
    for gene in gene_list:
        if gene in authors_GB_genes:
            new_gene_list.append(gene)
    gene_list = new_gene_list

    # Create a list of all authors/weights who have published on at least one
    # gene in gene_list.
    AW_list_total = []
    for gene in gene_list:
        AW_list_total.extend(list(authors_GB_genes[gene].index))
    AW_list_total = list(zip(*AW_list_total))
    author_list_total = AW_list_total[0]
    weight_list_total = AW_list_total[1]
    print(time.time() - t0)
    author_list_total = pd_Series(author_list_total)
    weight_list_total = pd_Series(weight_list_total, index=author_list_total)

    # Take the mean of duplicate entries.
    df_temp = pd_DataFrame(
        {
            'weight': list(weight_list_total),
            'author': list(author_list_total)
        },
        index=range(len(author_list_total)))
    AW_gb_temp = df_temp.weight.groupby(df_temp['author']).mean()
    author_list_total = list(AW_gb_temp.index)
    weight_list_total = list(AW_gb_temp.values)
    weight_list_total = pd_Series(weight_list_total, index=author_list_total)

    # Make a dataframe, indexed by authors in author_list_total, with one
    # column per entry in gene_list.
    author_gene_df = pd_DataFrame(np.zeros([len(author_list_total), len(gene_list)]),
                                  index=author_list_total,
                                  columns=gene_list)
    print(time.time() - t0)

    # Fill in the dataframe.
    for gene in gene_list:
        # print(gene)
        temp = list(authors_GB_genes[gene].index)
        temp = list(zip(*temp))
        authors_temp = list(np.unique(temp[0]))
        author_gene_df[gene][authors_temp] = weight_list_total[authors_temp]
    print(time.time() - t0)

    # Add a column for total weight and sort by it.
    author_gene_df['total_weight'] = np.sum(np.array(author_gene_df), 1)
    author_gene_df.sort_values('total_weight', inplace=True, ascending=False)

    # Next, convert this dataframe into a bipartite network.
    author_gene_bp = nx.Graph()

    # Pick out authors who have published on > pub_thresh genes in gene_list.
    index_temp = list(author_gene_df['total_weight'][
        author_gene_df['total_weight'] > pub_thresh].index)
    # Only allow 200 authors max.
    if len(index_temp) > 200:
        author_nodes = index_temp[0:200]
    else:
        author_nodes = index_temp
    # index_temp = list(author_gene_df['total_num'].index)
    # author_nodes = index_temp[0:num_authors]
    print(time.time() - t0)

    for gene in gene_list:
        for author in author_nodes:
            # Only add a link if a connection exists.
            if author_gene_df[gene][author] > 0:
                author_gene_bp.add_edge(gene, author)
    # Add all genes in gene_list in case none of them come up.
    author_gene_bp.add_nodes_from(gene_list)

    # Now apply a community-detection algorithm to the bipartite graph.
    partition = community.best_partition(author_gene_bp)
    partition = pd_Series(partition)
    col_temp_authors = partition[author_nodes]
    col_temp_genes = partition[gene_list]
    col_temp = partition[list(author_gene_bp.nodes())]

    if plot_flag:
        # Plot the graph if plot_flag is True.
        plt.figure(figsize=[15, 15])
        pos = nx.spring_layout(author_gene_bp, k=.3)
        # nx.draw(author_gene_bp, pos=pos, alpha=.5, node_size=100, node_color=col_temp, cmap='Paired')
        gene_list = list(gene_list)
        nx.draw_networkx_nodes(author_gene_bp, nodelist=author_nodes,
                               node_color=col_temp_authors, cmap='Paired',
                               pos=pos, alpha=.5, node_size=100)
        nx.draw_networkx_nodes(author_gene_bp, nodelist=gene_list,
                               node_color=col_temp_genes, cmap='Paired',
                               pos=pos, alpha=.5, node_size=200, node_shape='s')
        nx.draw_networkx_edges(author_gene_bp, pos=pos, alpha=.1)
        # Only label the top 20 authors plus the genes.
        node_subset_dict = dict(zip(index_temp[0:20], index_temp[0:20]))
        gene_subset_dict = dict(zip(gene_list, gene_list))
        node_subset_dict.update(gene_subset_dict)
        nx.draw_networkx_labels(author_gene_bp, pos=pos, labels=node_subset_dict)

    # Set up json for saving.
    # What should the colors be?
    num_communities = len(np.unique(col_temp))
    color_list = plt.cm.gist_rainbow(np.linspace(0, 1, num_communities))

    # Blend the community colors (so that to-nodes are a mixture of all the
    # communities they belong to).
    rfrac, gfrac, bfrac = calc_community_fraction(author_gene_bp, author_nodes,
                                                  gene_list, partition, color_list)

    # Save the network in json format.
    nodes = list(author_gene_bp.nodes())
    numnodes = len(nodes)
    edges = list(author_gene_bp.edges())
    numedges = len(edges)
    # nodes_dict = [{"id": n, "com": col_temp[n], "degree": author_gene_bp.degree(n)} for n in nodes]
    nodes_dict = [{
        "id": n,
        "com": col_temp[n],
        "degree": author_gene_bp.degree(n),
        "rfrac": rfrac[n] * 255,
        "gfrac": gfrac[n] * 255,
        "bfrac": bfrac[n] * 255
    } for n in nodes]
    # Map to indices for source/target in edges.
    node_map = dict(zip(nodes, range(numnodes)))
    edges_dict = [{
        "source": node_map[edges[i][0]],
        "target": node_map[edges[i][1]]
    } for i in range(numedges)]
    # import json
    json_graph = {"directed": False, "nodes": nodes_dict, "links": edges_dict}
    # json.dump(json_graph, open(save_file_name, 'w'))
    print(time.time() - t0)
    return json_graph
hash_results += sub_counts
print(s)
print('Substrings count by naive search: {}'.format(naive_results))
print('Substrings count by Rabin-Karp with hash(): {}'.format(hash_results))

# 2. Encode any string of three words using the Huffman algorithm.
seed(42)
message = input('Enter any string: ')
message_list = list(message)
# Symbol frequencies for the Huffman tree.
message_symb, message_freq = np_unique(message_list, return_counts=True)
df = pd_DataFrame({'s': message_symb, 'f': message_freq})
message_dict = dict(zip(message_symb, ['' for _ in range(len(message_symb))]))
while df.shape[0] >= 2:
    # Merge the two least frequent nodes, prepending a bit to the codes of the
    # symbols contained in each of them.
    df.sort_values(by=['f'], inplace=True)  # by=['f', 's'], ascending=True
    i0, i1 = choice([[1, 0], [0, 1]])
    for s in message_dict:
        if s in df.iloc[i0].s:
            message_dict[s] = '0' + message_dict[s]
        if s in df.iloc[i1].s:
            message_dict[s] = '1' + message_dict[s]
    df = df.append(df.iloc[0:2].sum(), ignore_index=True)
    df = df.iloc[2:]
coded_message = message
for s in message_dict:
    # Replace every symbol of the message with its Huffman code.
    coded_message = coded_message.replace(s, message_dict[s])