Example 1
def get_task_results(context):
    # Scan the tail of the Celery log and collect one row per task line.
    with open('/tmp/aprinto_celery_tail', 'r') as f:
        lines = f.readlines()
    cols = ['task_status', 'task_name', 'task_id']
    pattern = r'(.*)(\btask\b\s)([^:]+)(: document_processing\.\b)([^(]+)\(([^)]+)\)(.*)'
    rows = []
    for it in lines:
        parts = it.split()
        if len(parts) > 3 and parts[3] == 'task':
            # Reduce the log line to "status,name,id" via backreferences.
            t = re_sub(pattern, r'\g<3>,\g<5>,\g<6>', it).replace('\n', '')
            rows.append(dict(zip(cols, t.split(','))))
    # Build the DataFrame in one pass (DataFrame.append was removed in pandas 2.x).
    context.celery_results = pd_DataFrame(rows, columns=cols)
    return context
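A minimal usage sketch for the snippet above, assuming the aliases it relies on come from imports like these and that context is any object with a writable attribute (the Context class here is illustrative, not from the original project):

from re import sub as re_sub
from pandas import DataFrame as pd_DataFrame

class Context:
    pass

ctx = get_task_results(Context())
print(ctx.celery_results.head())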
Example 2
def parse_xml(pdf):
    # Parse pdftohtml-style XML: one DataFrame row per <text> element,
    # keyed by the element's numeric attributes.
    pages = soup(codecs_encode(pdf, 'utf8', 'ignore')).findAll('page')
    cols = ['page', 'font', 'top', 'left', 'width', 'height', 'text']
    rows = []
    for idx, pg in enumerate(pages, start=1):
        pg = str(pg)
        line_iter = re_findall(r'(<text.*?</text>)', pg)

        for it in line_iter:
            a = ['page'] + re_findall(r'([a-zA-Z]+)=', it) + ['text']
            text_attrs = it[5:it.find('>')].strip()
            text_contents = str(soup(it).text)
            # int() each numeric attribute; a bare map() here would need
            # list() on Python 3 before list concatenation.
            b = [idx] + [int(s) for s in re_findall(r'[0-9]+', text_attrs)] + [text_contents]
            if text_contents.strip() != '':
                rows.append(dict(zip(a, b)))
    return pd_DataFrame(rows, columns=cols)
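A usage sketch, assuming the aliases resolve to BeautifulSoup, codecs, re, and pandas as below; the input file name is a placeholder:

from codecs import encode as codecs_encode
from re import findall as re_findall
from bs4 import BeautifulSoup as soup
from pandas import DataFrame as pd_DataFrame

# XML as produced by e.g. `pdftohtml -xml example.pdf`
with open('example.xml') as fh:
    frame = parse_xml(fh.read())
print(frame.head())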
Example 3
    def CHARACTERISTIC_Export(self):
        # Find all regions describing calibration variables (CHARACTERISTIC blocks).
        Character = re.findall(
            r'/begin *?CHARACTERISTIC([\S\s]*?)/end *?CHARACTERISTIC',
            self.content, re.M | re.I)

        CharacterLen = len(Character)
        self.CharLength = CharacterLen
        # Empty DataFrame to hold all calibration-variable information.
        DFCharacter = pd_DataFrame()
        # Pre-sized lists, one slot per CHARACTERISTIC block.
        ListCharName = [0] * CharacterLen     # name
        ListCharType = [0] * CharacterLen     # type
        ListCharAdd = [0] * CharacterLen      # address
        ListChaDataType = [0] * CharacterLen  # data type
        ListCharConv = [0] * CharacterLen     # conversion method
        ListCharPNum = [1] * CharacterLen     # number of points
        ListCharMax = [0] * CharacterLen      # upper limit
        ListCharMin = [0] * CharacterLen      # lower limit

        # X-axis bookkeeping, one slot per block.
        ListXaxisType = [0] * CharacterLen          # axis type
        ListXaxisPNumDataType = [0] * CharacterLen  # data type of the point count when the axis type is STD_AXIS
        ListXaxisOffset = [0] * CharacterLen        # FIX_AXIS offset
        ListXaxisShift = [0] * CharacterLen         # FIX_AXIS shift
        ListXaxisPNum = [1] * CharacterLen          # number of axis points
        ListXaxisPRef = [0] * CharacterLen          # AXIS_PTS_REF reference
        ListXaxisConv = [0] * CharacterLen          # conversion method
        ListXaxisMax = [0] * CharacterLen           # upper limit
        ListXaxisMin = [0] * CharacterLen           # lower limit

        # Y-axis bookkeeping, one slot per block.
        ListYaxisType = [0] * CharacterLen          # axis type
        ListYaxisPNumDataType = [0] * CharacterLen  # data type of the point count when the axis type is STD_AXIS
        ListYaxisOffset = [0] * CharacterLen        # FIX_AXIS offset
        ListYaxisShift = [0] * CharacterLen         # FIX_AXIS shift
        ListYaxisPNum = [1] * CharacterLen          # number of axis points
        ListYaxisPRef = [0] * CharacterLen          # AXIS_PTS_REF reference
        ListYaxisConv = [0] * CharacterLen          # conversion method
        ListYaxisMax = [0] * CharacterLen           # upper limit
        ListYaxisMin = [0] * CharacterLen           # lower limit

        # Process one calibration block at a time: Name Long-Identifier Type
        # Address Deposit MaxDiff Conversion Lower-Limit Upper-Limit
        for i in range(CharacterLen):
            # Strip surrounding whitespace from this block.
            CharChunk = Character[i].strip()

            # Replace every quoted "..." with "Description" so embedded
            # spaces cannot break the whitespace split below.
            CharChunk = re.sub(r"\".*?(?<!\\)\"", "Description", CharChunk)

            # Split the block on runs of whitespace; splitting on r'\s*'
            # would split between every character on Python 3.7+.
            TempList = re.split(r'\s+', CharChunk)
            # name
            ListCharName[i] = TempList[0]
            # type
            ListCharType[i] = TempList[2]
            # Prefix a sort letter so Excel sorts the types in a sensible order.
            if (ListCharType[i] == 'VALUE'):
                ListCharType[i] = 'a_VALUE'
            if (ListCharType[i] == 'VAL_BLK'):
                ListCharType[i] = 'b_VAL_BLK'
            if (ListCharType[i] == 'ASCII'):
                ListCharType[i] = 'c_ASCII'
            if (ListCharType[i] == 'CURVE'):
                ListCharType[i] = 'd_CURVE'
            if (ListCharType[i] == 'MAP'):
                ListCharType[i] = 'e_MAP'

            # address (hex string)
            ListCharAdd[i] = int(TempList[3], 16)
            # data type (the Deposit/record-layout field)
            ListChaDataType[i] = TempList[4]
            # conversion method
            ListCharConv[i] = TempList[6]
            # upper limit
            ListCharMax[i] = TempList[8]
            # lower limit
            ListCharMin[i] = TempList[7]

            # Array-type data: two ways of stating the element count have been
            # seen so far, MATRIX_DIM and NUMBER.  Compare against ListCharType[i],
            # since TempList[2] still holds the unprefixed type name.
            if ListCharType[i] == 'b_VAL_BLK':
                # Re-split the block into lines.
                TempList = re.split(r'\n+', CharChunk)
                for j in TempList:
                    # Strip leading/trailing whitespace from the line.
                    j = j.strip()
                    # Split the line on whitespace.
                    TempLineList = re.split(r'\s+', j)
                    if TempLineList[0] == 'MATRIX_DIM':
                        PointNumber = int(TempLineList[1]) * int(
                            TempLineList[2]) * int(TempLineList[3])
                        ListCharPNum[i] = PointNumber
                    if TempLineList[0] == 'NUMBER':
                        ListCharPNum[i] = int(TempLineList[1])
            elif ListCharType[i] == 'c_ASCII':
                TempList = re.split(r'\n+', CharChunk)
                for j in TempList:
                    # Strip leading/trailing whitespace from the line.
                    j = j.strip()
                    # Split the line on whitespace.
                    TempLineList = re.split(r'\s+', j)
                    if TempLineList[0] == 'NUMBER':
                        ListCharPNum[i] = int(TempLineList[1])
            # Curve and map types.
            elif (ListCharType[i] == 'd_CURVE') or (ListCharType[i]
                                                    == 'e_MAP'):
                # Extract the AXIS_DESCR block(s) from CharChunk.
                AxisDescrChunk = re.findall(
                    r'/begin *?AXIS_DESCR([\S\s]*?)/end *?AXIS_DESCR',
                    CharChunk, re.M | re.I)
                # Attribute InputQuantity Conversion MaxAxisPoints LowerLimit UpperLimit
                # Strip surrounding whitespace.
                AxisDescrChunk[0] = AxisDescrChunk[0].strip()
                # Split the X-axis description on whitespace.
                TempList = re.split(r'\s+', AxisDescrChunk[0])
                ListXaxisType[i] = TempList[0]
                ListXaxisConv[i] = TempList[2]
                # upper limit
                ListXaxisMax[i] = TempList[5]
                # lower limit
                ListXaxisMin[i] = TempList[4]

                # Standard axis.
                if TempList[0] == 'STD_AXIS':
                    ListXaxisPNumDataType[i] = ListChaDataType[i]
                    # For now take the declared maximum number of axis points
                    # rather than reading the actual count from memory.
                    ListXaxisPNum[i] = int(TempList[3])
                # Fixed (evenly spaced) axis.
                if TempList[0] == 'FIX_AXIS':
                    # Re-split the X-axis description into lines.
                    TempList = re.split(r'\n+', AxisDescrChunk[0])
                    for j in TempList:
                        j = j.strip()
                        # Split the line on whitespace.
                        TempLineList = re.split(r'\s+', j)
                        if TempLineList[0] == 'FIX_AXIS_PAR':
                            ListXaxisOffset[i] = int(TempLineList[1])
                            ListXaxisShift[i] = int(TempLineList[2])
                            ListXaxisPNum[i] = int(TempLineList[3])
                if TempList[0] == 'COM_AXIS':
                    # Re-split the X-axis description into lines.
                    TempList = re.split(r'\n+', AxisDescrChunk[0])
                    for j in TempList:
                        j = j.strip()
                        # Split the line on whitespace.
                        TempLineList = re.split(r'\s+', j)
                        if TempLineList[0] == 'AXIS_PTS_REF':
                            ListXaxisPRef[i] = TempLineList[1]
                #ListCharPNum[i] = ListXaxisPNum[i]

                if ListCharType[i] == 'e_MAP':
                    # Strip surrounding whitespace.
                    AxisDescrChunk[1] = AxisDescrChunk[1].strip()
                    # Split the Y-axis description on whitespace.
                    TempList = re.split(r'\s+', AxisDescrChunk[1])
                    ListYaxisType[i] = TempList[0]
                    ListYaxisConv[i] = TempList[2]
                    # upper limit
                    ListYaxisMax[i] = TempList[5]
                    # lower limit
                    ListYaxisMin[i] = TempList[4]
                    # Standard axis.
                    if TempList[0] == 'STD_AXIS':
                        ListYaxisPNumDataType[i] = ListChaDataType[i]
                        # For now take the declared maximum number of axis
                        # points rather than reading the actual count from memory.
                        ListYaxisPNum[i] = int(TempList[3])
                    # Fixed (evenly spaced) axis.
                    if TempList[0] == 'FIX_AXIS':
                        # Re-split the Y-axis description into lines.
                        TempList = re.split(r'\n+', AxisDescrChunk[1])
                        for j in TempList:
                            j = j.strip()
                            # Split the line on whitespace.
                            TempLineList = re.split(r'\s+', j)
                            if TempLineList[0] == 'FIX_AXIS_PAR':
                                ListYaxisOffset[i] = int(TempLineList[1])
                                ListYaxisShift[i] = int(TempLineList[2])
                                ListYaxisPNum[i] = int(TempLineList[3])
                    if TempList[0] == 'COM_AXIS':
                        # Re-split the Y-axis description into lines.
                        TempList = re.split(r'\n+', AxisDescrChunk[1])
                        for j in TempList:
                            j = j.strip()
                            # Split the line on whitespace.
                            TempLineList = re.split(r'\s+', j)
                            if TempLineList[0] == 'AXIS_PTS_REF':
                                ListYaxisPRef[i] = TempLineList[1]
                    #ListCharPNum[i] = ListXaxisPNum[i] * ListYaxisPNum[i]

        DFCharacter['CharName'] = ListCharName
        DFCharacter['CharType'] = ListCharType
        DFCharacter['CharAdd'] = ListCharAdd
        DFCharacter['ChaDataType'] = ListChaDataType
        DFCharacter['CharConv'] = ListCharConv
        DFCharacter['CharPNum'] = ListCharPNum
        DFCharacter['CharMin'] = ListCharMin
        DFCharacter['CharMax'] = ListCharMax

        DFCharacter['XaxisType'] = ListXaxisType
        DFCharacter['XaxisPNumDataType'] = ListXaxisPNumDataType
        DFCharacter['XaxisOffset'] = ListXaxisOffset
        DFCharacter['XaxisShift'] = ListXaxisShift
        DFCharacter['XaxisPNum'] = ListXaxisPNum
        DFCharacter['XaxisPRef'] = ListXaxisPRef
        DFCharacter['XaxisConv'] = ListXaxisConv
        DFCharacter['XaxisMin'] = ListXaxisMin
        DFCharacter['XaxisMax'] = ListXaxisMax

        DFCharacter['YaxisType'] = ListYaxisType
        DFCharacter['YaxisPNumDataType'] = ListYaxisPNumDataType
        DFCharacter['YaxisOffset'] = ListYaxisOffset
        DFCharacter['YaxisShift'] = ListYaxisShift
        DFCharacter['YaxisPNum'] = ListYaxisPNum
        DFCharacter['YaxisPRef'] = ListYaxisPRef
        DFCharacter['YaxisConv'] = ListYaxisConv
        DFCharacter['YaxisMin'] = ListYaxisMin
        DFCharacter['YaxisMax'] = ListYaxisMax
        return DFCharacter
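For orientation, a runnable sketch of the whitespace split this parser relies on. The A2L fragment is an illustrative stand-in, not from a real project file, with fields ordered as in the comment above (Name, Long-Identifier, Type, Address, Deposit, MaxDiff, Conversion, Lower-Limit, Upper-Limit):

import re

chunk = 'EngSpdLimit "engine speed limit" VALUE 0x80001000 DepRecord 8000. CM_rpm 0. 8000.'

# Neutralise the quoted long identifier, then split on whitespace.
chunk = re.sub(r"\".*?(?<!\\)\"", "Description", chunk)
fields = re.split(r'\s+', chunk)
print(fields[0])           # EngSpdLimit  -> name
print(fields[2])           # VALUE        -> type
print(int(fields[3], 16))  # 2147487744   -> address parsed from hex
print(fields[6])           # CM_rpm       -> conversion method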
Example 4
    def COMPU_METHOD_Export(self):

        # Find all regions describing conversion formulas (COMPU_METHOD blocks).
        ConvMoth = re.findall(
            r'/begin *?COMPU_METHOD([\S\s]*?)/end *?COMPU_METHOD',
            self.content, re.M | re.I)

        # Empty DataFrame to hold all conversion information.
        DFConvInfo = pd_DataFrame()
        # Empty lists for the name, the unit, and the six RAT_FUNC coefficients.
        ListConvName = []
        ListUnit = []
        ListA = []
        ListB = []
        ListC = []
        ListD = []
        ListE = []
        ListF = []

        # Process one conversion-formula block at a time.
        for i in ConvMoth:
            # Strip surrounding whitespace from the block.
            i = i.strip()

            # Optionally neutralise quoted "  " strings with "Description":
            # i = re.sub(r"\"\s+\"", "Description", i)

            # Split the block on runs of whitespace (r'\s*' would split
            # between every character on Python 3.7+).
            TempList = re.split(r'\s+', i)
            # name
            ListConvName.append(TempList[0])

            # unit: legacy Python 2 GBK decoding; on Python 3 the value is
            # already str, so decode only if it is still bytes.
            if isinstance(TempList[4], bytes):
                TempList[4] = TempList[4].decode('gbk')
            ListUnit.append(TempList[4])

            # Only RAT_FUNC formulas are handled for now.
            if TempList[2] == 'RAT_FUNC':
                # Re-split the block into lines.
                ConvInfoList = re.split(r'\n+', i)
                # Scan the lines selectively.
                for j in ConvInfoList:
                    # Strip surrounding whitespace from the line.
                    j = j.strip()
                    # Split the line on spaces.
                    ListLine = j.split()
                    # A line starting with COEFFS carries the six coefficients.
                    if ListLine and ListLine[0] == 'COEFFS':
                        ListA.append(float(ListLine[1]))
                        ListB.append(float(ListLine[2]))
                        ListC.append(float(ListLine[3]))
                        ListD.append(float(ListLine[4]))
                        ListE.append(float(ListLine[5]))
                        ListF.append(float(ListLine[6]))
            else:
                # Identity coefficients for unhandled formula types.
                ListA.append(0)
                ListB.append(1)
                ListC.append(0)
                ListD.append(0)
                ListE.append(0)
                ListF.append(1)
        # Sentinel entry for characteristics without a conversion method.
        ListConvName.append('NO_COMPU_METHOD')
        ListUnit.append('')
        ListA.append(0)
        ListB.append(1)
        ListC.append(0)
        ListD.append(0)
        ListE.append(0)
        ListF.append(1)
        # Add the name, unit, and coefficient lists to the DataFrame.
        DFConvInfo['Name'] = ListConvName
        DFConvInfo['Unit'] = ListUnit
        DFConvInfo['A'] = ListA
        DFConvInfo['B'] = ListB
        DFConvInfo['C'] = ListC
        DFConvInfo['D'] = ListD
        DFConvInfo['E'] = ListE
        DFConvInfo['F'] = ListF

        return DFConvInfo
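Assuming the usual ASAP2 convention that the six RAT_FUNC coefficients define internal = (a*x**2 + b*x + c) / (d*x**2 + e*x + f) for a physical value x, a small sketch of how the exported rows could be applied:

def rat_func_internal(x, a, b, c, d, e, f):
    # Rational conversion from a physical value x to the internal value.
    return (a * x**2 + b * x + c) / (d * x**2 + e * x + f)

# The sentinel coefficients appended above (0, 1, 0, 0, 0, 1) give the
# identity mapping:
print(rat_func_internal(42.0, 0, 1, 0, 0, 0, 1))  # -> 42.0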
Example 5
def write2csv_tpot(X, y, outfName, feat_list):
    # Append y as the last column of X and write the result to CSV,
    # using feat_list as the header row.
    dat = np_hstack((X, y[:, None]))
    df = pd_DataFrame(dat)
    df.to_csv(path_or_buf=outfName, index=False, header=feat_list)
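A usage sketch, assuming the aliases come from numpy and pandas imports like the following; note that feat_list must hold one entry per feature plus one for the target column:

import numpy as np
from numpy import hstack as np_hstack
from pandas import DataFrame as pd_DataFrame

X = np.random.rand(5, 3)               # 5 samples, 3 features
y = np.array([0., 1., 0., 1., 1.])     # target column
write2csv_tpot(X, y, 'tpot_input.csv', ['f1', 'f2', 'f3', 'target'])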
Example 6
def _get_xbrl_datas(xbrl_file, xbrl_file_data):
    """Fetch the data from one XBRL file."""

    # Read the XBRL file.
    if RE_XBRL_P_V1_MATCH(os_basename(xbrl_file)):
        # Old EDINET XBRL format.
        # print(xbrl_file)
        xbrl = xbrl_jpfr_Parser(xbrl_file, xbrl_file_data)
        xbrl_ver = 1
    elif RE_XBRL_P_V2_MATCH(os_basename(xbrl_file)):
        # print(xbrl_file)
        xbrl = xbrl_jpcor_Parser(xbrl_file, xbrl_file_data)
        xbrl_ver = 2
    else:
        # Audit-report XBRL files (jpaud-***.xbrl) fall through to here.
        # print('Unsupported file name %s' % xbrl_file)
        return None

    # Column labels for the rows collected below; the Japanese names are kept
    # because they are used as DataFrame keys downstream (glosses in comments).
    data_labels = [
        'version',
        '提出日',              # filing date
        '提出回数',            # number of submissions
        '報告対象期間期末日',  # period-end date of the reporting period
        '追番',                # serial number
        '第N期',               # fiscal year N
        '名前空間接頭辞',      # namespace prefix
        'tag',
        'id',
        'context',
        '開始日',              # start date
        '終了日',              # end date
        '期末日',              # period-end date
        '連結',                # consolidation flag
        '値',                  # value
    ]

    context_tags = xbrl.context_tags

    xbrl_infos = [
        xbrl_ver,
        xbrl.info['提出日'],
        xbrl.info['提出回数'],
        xbrl.info['報告対象期間期末日'],
        xbrl.info['追番'],
        xbrl.info['第N期'],
    ]

    datas = []
    datas_append = datas.append

    # Accounting standard, when the filing provides one.
    xbrl_standard = xbrl.info['会計基準'] if '会計基準' in xbrl.info else None

    # Kinds of entries in xbrl.xbrl_datas (prefixes matching each namespace):
    #   management information (jpfr-di, ifrs, jpdei_cor)
    #   cover page / summary / body text (jpcrp_cor)
    #   financial statements (jpfr-t-***, ifrs, jppfs_cor)
    #   filer-specific taxonomy (*E00000*)
    for (namespace, xbrl_data) in xbrl.xbrl_datas:

        # Key tuple: (tag name, context ref, ID); value dict: attributes and text.
        for ((t_tag, t_context_ref, t_id), v) in xbrl_data.items():

            # Split the namespace off the tag name; it is mapped to a prefix below.
            (t_ns, t_tag_name) = t_tag.rsplit('}', maxsplit=1)
            try:
                datas_append(
                    # XBRL version and document information.
                    xbrl_infos +

                    # Namespace prefix, tag name, id attribute, context ref.
                    [
                        xbrl.ns_prefixes[t_ns.lstrip('{')],
                        t_tag_name,
                        t_id,
                        t_context_ref,
                    ] +

                    # Start date, end date, period-end date.
                    _get_dates(context_tags[t_context_ref]['period']) +

                    # Consolidation flag and type-converted value.
                    [
                        _get_consolidated_or_nonconsolidated(
                            context_tags[t_context_ref], xbrl_ver,
                            xbrl_standard),
                        conv_str_to_num(v['text']),
                    ])
            except Exception:
                print(format_exc())
    del (xbrl, xbrl_infos, context_tags)

    # Convert the rows to a DataFrame.
    df = pd_DataFrame(datas, columns=data_labels)
    del (datas, data_labels)

    def df_conv_str_to_datetime(t_column_name):
        """Convert a string column to datetime."""
        try:
            df[t_column_name] = pd_to_datetime(df[t_column_name])
        except (TypeError, ValueError):
            print('Conversion error in %s; retrying with conv_str_to_num' %
                  t_column_name)
            df[t_column_name] = df[t_column_name].apply(conv_str_to_num)
        return

    # Filing date, start date, end date, period-end date.
    for column_name in ('提出日', '開始日', '終了日', '期末日'):
        df_conv_str_to_datetime(column_name)

    return df
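The helper _get_dates is referenced above but not shown. A hypothetical minimal version, consistent with the three date columns it must fill and with standard XBRL period elements; this is an assumption, not the project's actual implementation:

def _get_dates(period):
    # Hypothetical: return [start date, end date, instant (period-end) date],
    # padding with None for whichever representation the context lacks.
    if 'instant' in period:
        return [None, None, period['instant']]
    return [period.get('startDate'), period.get('endDate'), None]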
Example 7
def esearch_disease(DISEASE_LIST, OUTDIR):

    CREATE_DIR(OUTDIR)

    DISEASE_DIC = MAKE_DICIONARY(DISEASE_LIST)

    # Data frame to store all counts
    # (+2 adds one extra row each for "COUNTS" and "TOTAL1").
    df = pd_DataFrame(index=range(0,
                                  len(DISEASE_DIC) + 2),
                      columns=range(0, 8))
    df.columns=["disease","COD","QUERY1","QUERY2","QUERY3","QUERY4",\
                "QUERY5","TOTAL2"]
    COL1 = list(DISEASE_DIC)
    COL1.append('COUNTS')
    COL1.append('TOTAL1')
    df['disease'] = COL1

    # Data frame to store the command used for each search.
    COMMAND = pd_DataFrame(index=range(0, len(DISEASE_DIC)),
                           columns=range(0, 8))
    COMMAND.columns=["disease","COD","QUERY1","QUERY2","QUERY3","QUERY4",\
                     "QUERY5","END"]
    COMMAND["disease"] = COL1[0:len(DISEASE_DIC)]
    COMMAND["END"] = '.'

    # Data frame to store the queries' descriptions.
    QUERY_description = pd_DataFrame(index=range(0, 5), columns=range(0, 1))
    QUERY_description.columns = ["DESCRIPTION"]
    QUERY_description.index = [
        "QUERY1", "QUERY2", "QUERY3", "QUERY4", "QUERY5"
    ]
    QUERY1_desc='Searches the disease name in all fields and filters for '\
                'gene-expression experiments done with human samples. '\
                'This is the broadest QUERY.'
    QUERY2_desc='Same as QUERY1, but also searches for "patient" OR '\
                '"patients" in all fields'
    QUERY3_desc='Same as QUERY2, but also filters for bioprojects '\
                'present in the SRA database'
    QUERY4_desc='Searches the disease name only in the bioproject title, '\
                'searches for "patient" OR "patients" in all fields, and '\
                'filters for gene-expression experiments done with '\
                'human samples'
    QUERY5_desc='Same as QUERY4, but also filters for bioprojects '\
                'present in the SRA database'
    QUERY_description["DESCRIPTION"]=[QUERY1_desc,QUERY2_desc,QUERY3_desc,\
                                      QUERY4_desc,QUERY5_desc]

    IdList_QUERY1 = []
    IdList_QUERY2 = []
    IdList_QUERY3 = []
    IdList_QUERY4 = []
    IdList_QUERY5 = []
    IdList_total = []

    N = 0
    for DISEASE in list(DISEASE_DIC):

        print(str(N) + '\t' + DISEASE)

        COD = DISEASE_DIC[DISEASE]
        # Use .loc instead of chained indexing, which modern pandas rejects.
        df.loc[N, "COD"] = COD
        COMMAND.loc[N, "COD"] = COD

        QUERY_DIC={'1':'("'+DISEASE+'"[All Fields])AND'\
                        '("transcriptome gene expression"[Filter]AND"org '\
                        'human"[Filter])',
                   '2':'("'+DISEASE+'"[All Fields]AND'\
                        '("patient"[All Fields]OR"patients"[All Fields])AND'\
                        '("transcriptome gene expression"[Filter]AND"org '\
                        'human"[Filter])',
                   '3':'("'+DISEASE+'"[All Fields]AND'\
                        '("patient"[All Fields]OR"patients"[All Fields])AND'\
                        '("transcriptome gene expression"[Filter]AND"org '\
                        'human"[Filter]AND"bioproject sra"[Filter])',
                   '4':'("'+DISEASE+'"[Title]AND'\
                        '("patient"[All Fields]OR"patients"[All Fields])AND'\
                        '("transcriptome gene expression"[Filter]AND"org '\
                        'human"[Filter])',
                   '5':'("'+DISEASE+'"[Title]AND'\
                        '("patient"[All Fields]OR"patients"[All Fields])AND'\
                        '("transcriptome gene expression"[Filter]AND"org '\
                        'human"[Filter])AND"bioproject sra"[Filter])'}

        Idlist_disease = []

        ROUND = ['1', '2', '3', '4', '5']
        for R in ROUND:
            QUERY = 'QUERY' + R
            TERM = QUERY_DIC[R]

            handle = Entrez.esearch(db="bioproject", retmax=1000, term=TERM)
            record = Entrez.read(handle)
            handle.close()
            if int(record["Count"]) > 1000:
                # Warn and carry on: esearch returns at most the first 1000 IDs.
                print('\nATTENTION!\n'+record["Count"]+' bioprojects are '\
                      'related to this esearch and only 1000 will be written '\
                      'to the Idlist for the further analysis.\n\n'+QUERY+\
                      ' for '+DISEASE+'\n\n'+QUERY_DIC[R]+'\n')

            # Build the per-disease list.
            Idlist_disease += list(record["IdList"])
            IdList_total += list(record["IdList"])

            # Add the IDs to the per-query lists and record the counts.
            if R == '1':
                IdList_QUERY1 += list(record["IdList"])
                COMMAND.loc[N, 'QUERY1'] = TERM
                df.loc[N, 'QUERY1'] = int(record["Count"])
            elif R == '2':
                IdList_QUERY2 += list(record["IdList"])
                COMMAND.loc[N, 'QUERY2'] = TERM
                df.loc[N, 'QUERY2'] = int(record["Count"])
            elif R == '3':
                IdList_QUERY3 += list(record["IdList"])
                COMMAND.loc[N, 'QUERY3'] = TERM
                df.loc[N, 'QUERY3'] = int(record["Count"])
            elif R == '4':
                IdList_QUERY4 += list(record["IdList"])
                COMMAND.loc[N, 'QUERY4'] = TERM
                df.loc[N, 'QUERY4'] = int(record["Count"])
            elif R == '5':
                IdList_QUERY5 += list(record["IdList"])
                COMMAND.loc[N, 'QUERY5'] = TERM
                df.loc[N, 'QUERY5'] = int(record["Count"])

        # Remove duplicates from the per-disease list.
        Idlist_disease = list(set(Idlist_disease))

        df.loc[N, 'TOTAL2'] = len(Idlist_disease)

        outfile = pathjoin(OUTDIR, "IdListDIR/disease", COD + ".txt")
        with open(outfile, 'w') as f:
            print("\n".join(Idlist_disease), file=f)

        N += 1

    # Fill the totals row.
    for COL in list(df)[2:]:  # from the third column to the last
        df.loc[len(DISEASE_DIC), COL] = df[COL][0:len(DISEASE_DIC)].sum()

    # Write the remaining lists to txt files (the explicit close calls are
    # unnecessary inside a with block).
    IdList_total = list(set(IdList_total))
    outfile = pathjoin(OUTDIR, "IdListDIR/IdList_total.txt")
    with open(outfile, 'w') as f:
        print("\n".join(IdList_total), file=f)

    IdList_QUERY1 = list(set(IdList_QUERY1))
    df.loc[len(DISEASE_DIC) + 1, "QUERY1"] = len(IdList_QUERY1)
    outfile = pathjoin(OUTDIR, "IdListDIR/query", "IdList_QUERY1.txt")
    with open(outfile, 'w') as f:
        print("\n".join(IdList_QUERY1), file=f)

    IdList_QUERY2 = list(set(IdList_QUERY2))
    df.loc[len(DISEASE_DIC) + 1, "QUERY2"] = len(IdList_QUERY2)
    outfile = pathjoin(OUTDIR, "IdListDIR/query", "IdList_QUERY2.txt")
    with open(outfile, 'w') as f:
        print("\n".join(IdList_QUERY2), file=f)

    IdList_QUERY3 = list(set(IdList_QUERY3))
    df.loc[len(DISEASE_DIC) + 1, "QUERY3"] = len(IdList_QUERY3)
    outfile = pathjoin(OUTDIR, "IdListDIR/query", "IdList_QUERY3.txt")
    with open(outfile, 'w') as f:
        print("\n".join(IdList_QUERY3), file=f)

    IdList_QUERY4 = list(set(IdList_QUERY4))
    df.loc[len(DISEASE_DIC) + 1, "QUERY4"] = len(IdList_QUERY4)
    outfile = pathjoin(OUTDIR, "IdListDIR/query", "IdList_QUERY4.txt")
    with open(outfile, 'w') as f:
        print("\n".join(IdList_QUERY4), file=f)

    IdList_QUERY5 = list(set(IdList_QUERY5))
    df.loc[len(DISEASE_DIC) + 1, "QUERY5"] = len(IdList_QUERY5)
    outfile = pathjoin(OUTDIR, "IdListDIR/query", "IdList_QUERY5.txt")
    with open(outfile, 'w') as f:
        print("\n".join(IdList_QUERY5), file=f)

    # Write all results to a single Excel file.
    writer = pd_ExcelWriter(pathjoin(OUTDIR, 'search_NCBI_RESULT.xlsx'),
                            engine='xlsxwriter')
    df.to_excel(writer, sheet_name='counts')
    COMMAND.to_excel(writer, sheet_name='command_lines')
    QUERY_description.to_excel(writer, sheet_name='query_description')
    writer.close()  # ExcelWriter.save() was removed in pandas 2.x

    return (pathjoin(osgetcwd(), OUTDIR))
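A minimal invocation sketch, assuming Biopython's Entrez module plus the helper aliases used above (CREATE_DIR, MAKE_DICIONARY, pathjoin, pd_ExcelWriter, osgetcwd); the e-mail address and disease names are placeholders:

from Bio import Entrez

Entrez.email = "you@example.org"  # NCBI requires a contact address
result_dir = esearch_disease(["alzheimer disease", "parkinson disease"],
                             "ncbi_search_out")
print("Results written under", result_dir)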
Example 8
def efetch_found_bioprojects(OUTDIR):

    def printProgressBar(iteration, total, prefix='', suffix='',
                         decimals=1, length=100, fill='█'):
        """
        Call in a loop to create a terminal progress bar
        @params:
            iteration   - Required  : current iteration (Int)
            total       - Required  : total iterations (Int)
            prefix      - Optional  : prefix string (Str)
            suffix      - Optional  : suffix string (Str)
            decimals    - Optional  : positive number of decimals in percent complete (Int)
            length      - Optional  : character length of bar (Int)
            fill        - Optional  : bar fill character (Str)
        """
        percent = ("{0:." + str(decimals) + "f}")\
                                    .format(100 * (iteration / float(total)))
        filledLength = int(length * iteration // total)
        bar = fill * filledLength + '-' * (length - filledLength)
        print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end='\r')
        # Print a new line on completion.
        if iteration == total:
            print()

    """
    COLLECT INFORMATION ABOUT THE BIOPROJECTS FOUND
    """

    if pathexists(OUTDIR):

        for DIR in ['Bioprojects', 'Bioprojects/xml']:
            if not pathexists(pathjoin(OUTDIR, DIR)):
                osmkdir(pathjoin(OUTDIR, DIR))

        path_to_list = pathjoin(OUTDIR, 'IdListDIR/IdList_total.txt')
        if isfile(path_to_list):
            with open(path_to_list, 'r') as f:
                IdList_total = list(filter(None, f.read().splitlines()))
        else:
            print('File '+path_to_list+' was not found. Run '\
                  'esearch_disease(OUTDIR) to create it.')
            exit()
    else:
        print('Directory '+pathjoin(OUTDIR)+' is not accessible. Did you run '\
              'esearch_disease() previously? If not, do it and try again.')
        exit()

    df2 = pd_DataFrame(index=range(0, len(IdList_total)), columns=range(0, 7))
    df2.columns = [
        "ID", "accession", "GEO", "title", "abstract", "disease", "COD"
    ]
    df2["ID"] = IdList_total

    print("\n\n")  # ESSE PRINT SERVE PARA DISTANCIAR A BARRA DE PROCESSAMENTO
    # QUE VEM LOGO ABAIXO DENTRO DO LOOPING

    # prepare bar progress
    l = len(IdList_total)
    i = 0
    printProgressBar(0, l, prefix='Download:', suffix='Complete', length=50)

    RECALL = []  # if download fails, the ID is stored in RECALL
    DIC_ID = {}
    for ID in IdList_total:

        try:
            handle = Entrez.efetch(db="bioproject", id=ID)
        except Exception:
            RECALL += [ID]
            print('handle = Entrez.efetch(db="bioproject", id=' + ID +
                  ')\tFAILED')
            continue  # avoid a catastrophic stop in case NCBI fails to give
            # the information for one ID
        try:
            record = handle.read()
            root = ET.fromstring(record)
            DIC = root.find(".//ProjectID/ArchiveID").attrib
            DIC_ID[DIC['accession']] = DIC_ID.get(DIC['accession'], DIC['id'])

            outfile=pathjoin(OUTDIR,'Bioprojects/xml',DIC['accession']+\
                                 '_'+DIC['id']+'.xml')

            #print(outfile)
            with open(outfile, "w", encoding="utf-8") as f:
                print(record, file=f)

        except Exception:
            RECALL += [ID]
            print('FAILED to process ' + ID + ' during the first trial')
            continue

        printProgressBar(i + 1,
                         l,
                         prefix='Download:',
                         suffix='Complete',
                         length=50)
        i += 1

    # RECALL for failure IDs
    if len(RECALL) > 0:
        print("\n\nFailure to download IDs. STARTING RECALL.")

        l = len(RECALL)
        i = 0
        printProgressBar(0,
                         l,
                         prefix='Download:',
                         suffix='Complete',
                         length=50)
        RECALL2 = []
        for ID in RECALL:

            try:
                handle = Entrez.efetch(db="bioproject", id=ID)
            except Exception:
                RECALL2 += [ID]
                print('handle = Entrez.efetch(db="bioproject", id='+ID+')'\
                      '\tFAILED in RECALL')
                continue
            try:
                record = handle.read()
                root = ET.fromstring(record)
                DIC = root.find(".//ProjectID/ArchiveID").attrib
                DIC_ID[DIC['accession']] = DIC_ID.get(DIC['accession'],
                                                      DIC['id'])

                outfile=pathjoin(OUTDIR,'Bioprojects/xml',DIC['accession']+\
                                     '_'+DIC['id']+'.xml')

                #print(outfile)
                with open(outfile, "w", encoding="utf-8") as f:
                    print(record, file=f)
            except Exception:
                RECALL2 += [ID]
                print('FAILED to process ' + ID + ' during the RECALL')
                continue

            printProgressBar(i + 1,
                             l,
                             prefix='RECALL:',
                             suffix='Complete',
                             length=50)
            i += 1

        if len(RECALL2) > 0:
            outfile = pathjoin(OUTDIR, 'Bioprojects/', 'RECALL_failure.txt')
            open(outfile, 'w').write(str(RECALL2))
            print("It was not possible to get some IDs even during the RECALL.\n"\
                  "You can find the problematic IDs in the file:\n"+outfile)

    outfile = pathjoin(OUTDIR, 'Bioprojects/', 'dict_ID_ACC.txt')
    open(outfile, 'w').write(str(DIC_ID))
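The nested printProgressBar helper is self-contained and can be exercised on its own; a minimal sketch, assuming it is lifted to module level:

import time

items = range(20)
printProgressBar(0, len(items), prefix='Progress:', suffix='Complete', length=40)
for i, _ in enumerate(items):
    time.sleep(0.05)  # stand-in for real work (one efetch call, say)
    printProgressBar(i + 1, len(items), prefix='Progress:', suffix='Complete', length=40)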
Example 9
    def eventstudy(self,
                   data=None,
                   model='m',
                   estwin=100,
                   gap=50,
                   evtwins=-10,
                   evtwine=10,
                   minval=70,
                   output='df'):
        """
            Paramaters passed to the event study method.

            data        =   event data (event date & permno combinations)
            model       =   madj (market-adjusted model)
                            m (market model)
                            ff (fama french)
                            ffm (fama french with momentum factor)
            estwin      =   estimation window
            gap         =   gap between estimation window and event window
            evtwins =   days preceding event date to begin event window
            evtwine =   days after event date to close the event window
            minval      =   minimum number of non-missing return observations (per event) to be regressed on
            output      =   output format of the event study results
                            xls (output an excel file to output path)
                            csv (output a csv file to output path)
                            json (output a json file to output path)
                            df (returns a dictionary of pandas dataframes)
                            print (outputs results to the console - not available via qsub)
        """

        ####################################################################################
        #  STEP 1 - SET ESTIMATION, EVENT, AND GAP WINDOWS AND GRAB DATA FROM EVENTS FILE  #
        ####################################################################################

        estwins = (estwin + gap + np_abs(evtwins))  # Estimation window start
        estwine = (gap + np_abs(evtwins) + 1)  # Estimation window end
        evtwinx = (
            estwins + 1
        )  # evt time value (0=event date, -10=window start, 10=window end)
        evtwins = np_abs(
            evtwins
        )  # convert the negative to positive, as we will use the SQL lag function
        evtrang = (evtwins + evtwine + 1
                   )  # total event window days (lag + lead + the day itself)
        """
            With the event date as a fixed point, calculate the number of days needed to pass
            to sql lag and lead functions to identify estimation window, gap, and event window.

            evtwins:    event date minus number of preceding days
                        ("event date" - "number of days before event to start [evtwins parameter]")

            evtwine:    event date plus number of following days
                        ("event date" + "number of days after event to end [evtwine parameter]")

            gap:    number of days between the end of the "estimation window"
                    and the beginning of the "event window"

            estwins:     start date of the estimation window
                        ("event date" - "number of days before event to start [evtwins parameter]"
                                      - "number of days in gap [gap parameter]"
                                      - "number of days in estimation window [estwin parameter]")

            evtrang:    entire time range of the event study even from estimate start, through gap,
                        until event window end
                        (evtwins + evtwine + 1)
        """

        # Default the event data in case it was not passed; serialize to JSON
        # either way, since the SQL below feeds the string to json_to_recordset.
        if data is None:
            data = [{"edate": "05/29/2012", "permno": "10002"}]
        evtdata = json_dumps(data)

        # init values wrapped up to be passed to sql statement
        params = {
            'estwins': estwins,
            'estwine': estwine,
            'evtwins': evtwins,
            'evtwine': evtwine,
            'evtwinx': evtwinx,
            'evtdata': evtdata
        }

        #############################################
        #  STEP 2 - GET RETURNS DATA FROM POSTGRES  #
        #############################################

        # Create a database connection
        wconn = self.connect()

        ##############################################################################
        #  Get the initial data from the database and put it in a pandas dataframe   #
        ##############################################################################

        # create a pandas dataframe that will hold data
        df = wconn.raw_sql("""
        SELECT
                a.*,
                x.*,
                c.date as rdate,
                c.ret as ret1,
                (f.mktrf+f.rf) as mkt,
                f.mktrf,
                f.rf,
                f.smb,
                f.hml,
                f.umd,
                (1+c.ret)*(coalesce(d.dlret,0.00)+1)-1-(f.mktrf+f.rf) as exret,
                (1+c.ret)*(coalesce(d.dlret,0.00)+1)-1 as ret,
                case when c.date between a.estwin1 and a.estwin2 then 1 else 0 end as isest,
                case when c.date between a.evtwin1 and a.evtwin2 then 1 else 0 end as isevt,
                case
                  when c.date between a.evtwin1 and a.evtwin2 then (rank() OVER (PARTITION BY x.evtid ORDER BY c.date)-%(evtwinx)s)
                  else (rank() OVER (PARTITION BY x.evtid ORDER BY c.date))
                end as evttime
        FROM
          (
            SELECT
              date,
              lag(date, %(estwins)s ) over (order by date) as estwin1,
              lag(date, %(estwine)s )  over (order by date) as estwin2,
              lag(date, %(evtwins)s )  over (order by date) as evtwin1,
              lead(date, %(evtwine)s )  over (order by date) as evtwin2
            FROM crsp_a_stock.dsi
          ) as a
        JOIN
        (select
                to_char(x.edate, 'ddMONYYYY') || trim(to_char(x.permno,'999999999')) as evtid,
                x.permno,
                x.edate
        from
        json_to_recordset('%(evtdata)s') as x(edate date, permno int)
        ) as x
          ON a.date=x.edate
        JOIN crsp_a_stock.dsf c
            ON x.permno=c.permno
            AND c.date BETWEEN a.estwin1 and a.evtwin2
        JOIN ff_all.factors_daily f
            ON c.date=f.date
        LEFT JOIN crsp_a_stock.dsedelist d
            ON x.permno=d.permno
            AND c.date=d.dlstdt
        WHERE f.mktrf is not null
        AND c.ret is not null
        ORDER BY x.evtid, x.permno, a.date, c.date
        """ % params)

        # Columns coming from the database query
        df.columns = [
            'date', 'estwin1', 'estwin2', 'evtwin1', 'evtwin2', 'evtid',
            'permno', 'edate', 'rdate', 'ret1', 'mkt', 'mktrf', 'rf', 'smb',
            'hml', 'umd', 'exret', 'ret', 'isest', 'isevt', 'evttime'
        ]

        # Additional columns that will hold computed values (post-query)
        addcols = [
            'RMSE', 'INTERCEPT', 'var_estp', 'expret', 'abret', 'alpha',
            '_nobs', '_p_', '_edf_', 'rsq', 'cret', 'cexpret', 'car', 'scar',
            'sar', 'pat_scale', 'bhar', 'lastevtwin', 'cret_edate',
            'scar_edate', 'car_edate', 'bhar_edate', 'pat_scale_edate', 'xyz'
        ]

        # Add them to the dataframe
        for c in addcols:
            if c == 'lastevtwin':
                df[c] = 0
            else:
                df[c] = np_nan

        ###################################################################################
        #  STEP 3 - FOR EACH EVENT, CALCULATE ABNORMAL RETURN BASED ON CHOSEN RISK MODEL  #
        ###################################################################################

        # Loop on every category
        for evt in data:

            permno = evt['permno']
            xdate = evt['edate']
            edate = datetime.strptime(xdate, "%m/%d/%Y").date()

            est_mask = (df['permno'] == permno) & (df['edate'] == edate) & (
                df['isest'] == 1)
            evt_mask = (df['permno'] == permno) & (df['edate'] == edate) & (
                df['isevt'] == 1)

            #######################################################
            #  Check to see it meets the min obs for est window   #
            #######################################################
            _nobs = df["ret"][est_mask].count()

            # Only carry out the analysis if the number of observations meets the minimum threshold
            if _nobs >= minval:

                #######################################################
                #  Regression based on the chosen risk model          #
                #######################################################

                # Market-Adjusted Model
                if model == 'madj':
                    # Set y to the estimation window records
                    y = df["exret"][est_mask]

                    # Calculate mean and standard deviation of returns for the estimation period
                    mean = np_mean(y)
                    stdv = np_std(y, ddof=1)

                    # Update the columns in the original dataframe (reusing the names from SAS code to help with continuity)
                    df.loc[evt_mask, 'INTERCEPT'] = mean
                    df.loc[evt_mask, 'RMSE'] = stdv
                    df.loc[evt_mask, '_nobs'] = len(y)
                    df.loc[evt_mask, 'var_estp'] = stdv**2
                    df.loc[evt_mask, 'alpha'] = mean
                    df.loc[evt_mask, 'rsq'] = 0
                    df.loc[evt_mask, '_p_'] = 1
                    df.loc[evt_mask, '_edf_'] = (len(y) - 1)
                    df.loc[evt_mask, 'expret'] = df.loc[evt_mask, 'mkt']
                    df.loc[evt_mask, 'abret'] = df.loc[evt_mask, 'exret']
                    df_est = df[est_mask]
                    _nobs = len(df_est[df_est.ret.notnull()])

                    nloc = {'const': 0}

                    def f_cret(row):
                        tmp = ((row['ret'] * nloc['const']) +
                               (row['ret'] + nloc['const']))
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'cret'] = df[evt_mask].apply(f_cret,
                                                                  axis=1)
                    df.loc[evt_mask, 'cret_edate'] = nloc['const']

                    nloc = {'const': 0}

                    def f_cexpret(row):
                        tmp = ((row['expret'] * nloc['const']) +
                               (row['expret'] + nloc['const']))
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'cexpret'] = df[evt_mask].apply(f_cexpret,
                                                                     axis=1)

                    nloc = {'const': 0}

                    def f_car(row):
                        tmp = (row['abret'] + nloc['const'])
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'car'] = df[evt_mask].apply(f_car, axis=1)
                    df.loc[evt_mask, 'car_edate'] = nloc['const']

                    nloc = {'const': 0}

                    def f_sar(row):
                        tmp = (row['abret'] / np_sqrt(row['var_estp']))
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'sar'] = df[evt_mask].apply(f_sar, axis=1)
                    df.loc[evt_mask, 'sar_edate'] = nloc['const']

                    nloc = {'const': 0, 'evtrang': evtrang}

                    def f_scar(row):
                        tmp = (row['car'] / np_sqrt(
                            (evtrang * row['var_estp'])))
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'scar'] = df[evt_mask].apply(f_scar,
                                                                  axis=1)
                    df.loc[evt_mask, 'scar_edate'] = nloc['const']

                    nloc = {'const': 0}

                    def f_bhar(row):
                        tmp = (row['cret'] - row['cexpret'])
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'bhar'] = df[evt_mask].apply(f_bhar,
                                                                  axis=1)
                    df.loc[evt_mask, 'bhar_edate'] = nloc['const']

                    df.loc[evt_mask,
                           'pat_scale'] = (_nobs - 2.00) / (_nobs - 4.00)
                    df.loc[evt_mask,
                           'pat_scale_edate'] = (_nobs - 2.00) / (_nobs - 4.00)

                # Market Model
                elif model == 'm':
                    # Set y to the estimation window records
                    X = df["mktrf"][est_mask]
                    y = df["ret"][est_mask]

                    # Fit an OLS model with intercept on mktrf
                    X = sm_add_constant(X)
                    est = sm_OLS(y, X).fit()

                    # Set the variables from the output
                    df_est = df[(df['permno'] == permno)
                                & (df['edate'] == edate) & (df['isest'] == 1)]
                    _nobs = len(
                        df_est[df_est.ret.notnull()])  # not null observations

                    # aggregate variables
                    # cret_edate = np_nan
                    # scar_edate = np_nan
                    # car_edate = np_nan
                    # bhar_edate = np_nan
                    # pat_scale_edate = np_nan
                    alpha = est.params['const']
                    beta1 = est.params['mktrf']

                    df.loc[evt_mask, 'INTERCEPT'] = alpha
                    df.loc[evt_mask, 'alpha'] = alpha
                    df.loc[evt_mask, 'RMSE'] = np_sqrt(est.mse_resid)
                    df.loc[evt_mask, '_nobs'] = _nobs
                    df.loc[evt_mask, 'var_estp'] = est.mse_resid
                    df.loc[evt_mask, 'rsq'] = est.rsquared
                    df.loc[evt_mask, '_p_'] = 2
                    df.loc[evt_mask, '_edf_'] = (len(y) - 2)

                    nloc = {'alpha': alpha, 'beta1': beta1, 'const': 0}

                    def f_expret(row):
                        return (nloc['alpha'] + (nloc['beta1'] * row['mktrf']))

                    df.loc[evt_mask, 'expret'] = df[evt_mask].apply(f_expret,
                                                                    axis=1)

                    nloc = {'alpha': alpha, 'beta1': beta1, 'const': 0}

                    def f_abret(row):
                        return (row['ret'] - (nloc['alpha'] +
                                              (nloc['beta1'] * row['mktrf'])))

                    df.loc[evt_mask, 'abret'] = df[evt_mask].apply(f_abret,
                                                                   axis=1)

                    nloc = {'const': 0}

                    def f_cret(row):
                        tmp = ((row['ret'] * nloc['const']) +
                               (row['ret'] + nloc['const']))
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'cret'] = df[evt_mask].apply(f_cret,
                                                                  axis=1)
                    df.loc[evt_mask, 'cret_edate'] = nloc['const']

                    nloc = {'const': 0}

                    def f_cexpret(row):
                        tmp = ((row['expret'] * nloc['const']) +
                               (row['expret'] + nloc['const']))
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'cexpret'] = df[evt_mask].apply(f_cexpret,
                                                                     axis=1)

                    nloc = {'const': 0}

                    def f_car(row):
                        # nonlocal const
                        tmp = (row['abret'] + nloc['const'])
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'car'] = df[evt_mask].apply(f_car, axis=1)
                    df.loc[evt_mask, 'car_edate'] = nloc['const']

                    nloc = {'const': 0}

                    def f_sar(row):
                        tmp = (row['abret'] / np_sqrt(row['var_estp']))
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'sar'] = df[evt_mask].apply(f_sar, axis=1)
                    df.loc[evt_mask, 'sar_edate'] = nloc['const']

                    nloc = {'const': 0, 'evtrang': evtrang}

                    def f_scar(row):
                        tmp = (row['car'] / np_sqrt(
                            (evtrang * row['var_estp'])))
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'scar'] = df[evt_mask].apply(f_scar,
                                                                  axis=1)
                    df.loc[evt_mask, 'scar_edate'] = nloc['const']

                    nloc = {'const': 0}

                    def f_bhar(row):
                        tmp = (row['cret'] - row['cexpret'])
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'bhar'] = df[evt_mask].apply(f_bhar,
                                                                  axis=1)
                    df.loc[evt_mask, 'bhar_edate'] = nloc['const']

                    df.loc[evt_mask,
                           'pat_scale'] = (_nobs - 2.00) / (_nobs - 4.00)
                    df.loc[evt_mask,
                           'pat_scale_edate'] = (_nobs - 2.00) / (_nobs - 4.00)

                # Fama-French Three Factor Model
                elif model == 'ff':
                    # Set y to the estimation window records
                    df_est = df[(df['permno'] == permno)
                                & (df['edate'] == edate) & (df['isest'] == 1)]
                    X = df_est[['smb', 'hml', 'mktrf']]
                    y = df_est['ret']

                    # Fit an OLS model with intercept on mktrf, smb, hml
                    X = sm_add_constant(X)
                    est = sm_OLS(y, X).fit()
                    # est = smf.ols(formula='ret ~ smb + hml + mktrf', data=df_est).fit()

                    alpha = est.params['const']
                    beta1 = est.params['mktrf']
                    beta2 = est.params['smb']
                    beta3 = est.params['hml']

                    df.loc[evt_mask, 'INTERCEPT'] = alpha
                    df.loc[evt_mask, 'alpha'] = alpha
                    df.loc[evt_mask, 'RMSE'] = np_sqrt(est.mse_resid)
                    df.loc[evt_mask, '_nobs'] = _nobs
                    df.loc[evt_mask, 'var_estp'] = est.mse_resid
                    df.loc[evt_mask, 'rsq'] = est.rsquared
                    df.loc[evt_mask, '_p_'] = 2
                    df.loc[evt_mask, '_edf_'] = (len(y) - 2)

                    nloc = {
                        'alpha': alpha,
                        'beta1': beta1,
                        'beta2': beta2,
                        'beta3': beta3,
                        'const': 0
                    }

                    def f_expret(row):
                        return ((nloc['alpha'] +
                                 (nloc['beta1'] * row['mktrf']) +
                                 (nloc['beta2'] * row['smb']) +
                                 (nloc['beta3'] * row['hml'])))

                    df.loc[evt_mask, 'expret'] = df[evt_mask].apply(f_expret,
                                                                    axis=1)

                    nloc = {
                        'alpha': alpha,
                        'beta1': beta1,
                        'beta2': beta2,
                        'beta3': beta3,
                        'const': 0
                    }

                    def f_abret(row):
                        return (row['ret'] - ((nloc['alpha'] +
                                               (nloc['beta1'] * row['mktrf']) +
                                               (nloc['beta2'] * row['smb']) +
                                               (nloc['beta3'] * row['hml']))))

                    df.loc[evt_mask, 'abret'] = df[evt_mask].apply(f_abret,
                                                                   axis=1)

                    nloc = {'const': 0}

                    def f_cret(row):
                        tmp = ((row['ret'] * nloc['const']) +
                               (row['ret'] + nloc['const']))
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'cret'] = df[evt_mask].apply(f_cret,
                                                                  axis=1)
                    df.loc[evt_mask, 'cret_edate'] = nloc['const']

                    nloc = {'const': 0}

                    def f_cexpret(row):
                        tmp = ((row['expret'] * nloc['const']) +
                               (row['expret'] + nloc['const']))
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'cexpret'] = df[evt_mask].apply(f_cexpret,
                                                                     axis=1)
                    nloc = {'const': 0}

                    def f_car(row):
                        tmp = (row['abret'] + nloc['const'])
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'car'] = df[evt_mask].apply(f_car, axis=1)
                    df.loc[evt_mask, 'car_edate'] = nloc['const']

                    nloc = {'const': 0}

                    def f_sar(row):
                        tmp = (row['abret'] / np_sqrt(row['var_estp']))
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'sar'] = df[evt_mask].apply(f_sar, axis=1)
                    df.loc[evt_mask, 'sar_edate'] = nloc['const']

                    nloc = {'const': 0, 'evtrang': evtrang}

                    def f_scar(row):
                        tmp = (row['car'] / np_sqrt(
                            (evtrang * row['var_estp'])))
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'scar'] = df[evt_mask].apply(f_scar,
                                                                  axis=1)
                    df.loc[evt_mask, 'scar_edate'] = nloc['const']

                    nloc = {'const': 0}

                    def f_bhar(row):
                        tmp = (row['cret'] - row['cexpret'])
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'bhar'] = df[evt_mask].apply(f_bhar,
                                                                  axis=1)
                    df.loc[evt_mask, 'bhar_edate'] = nloc['const']

                    df.loc[evt_mask,
                           'pat_scale'] = (_nobs - 2.00) / (_nobs - 4.00)
                    df.loc[evt_mask,
                           'pat_scale_edate'] = (_nobs - 2.00) / (_nobs - 4.00)

                # Fama-French Plus Momentum
                elif model == 'ffm':
                    # Set y to the estimation window records
                    df_est = df[(df['permno'] == permno)
                                & (df['edate'] == edate) & (df['isest'] == 1)]

                    X = df_est[['mktrf', 'smb', 'hml',
                                'umd']]  # indicator variables
                    y = df_est['ret']  # response variables

                    # Fit an OLS (ordinary least squares) model with intercept on mktrf, smb, hml, and umd
                    X = sm_add_constant(X)
                    est = sm_OLS(y, X).fit()

                    alpha = est.params['const']
                    beta1 = est.params['mktrf']
                    beta2 = est.params['smb']
                    beta3 = est.params['hml']
                    beta4 = est.params['umd']

                    df.loc[evt_mask, 'INTERCEPT'] = alpha
                    df.loc[evt_mask, 'alpha'] = alpha
                    df.loc[evt_mask, 'RMSE'] = np_sqrt(est.mse_resid)
                    df.loc[evt_mask, '_nobs'] = _nobs
                    df.loc[evt_mask, 'var_estp'] = est.mse_resid
                    df.loc[evt_mask, 'rsq'] = est.rsquared
                    df.loc[evt_mask, '_p_'] = 5  # four factors plus intercept
                    df.loc[evt_mask, '_edf_'] = (len(y) - 5)

                    nloc = {
                        'alpha': alpha,
                        'beta1': beta1,
                        'beta2': beta2,
                        'beta3': beta3,
                        'beta4': beta4,
                        'const': 0
                    }

                    def f_expret(row):
                        return ((nloc['alpha'] +
                                 (nloc['beta1'] * row['mktrf']) +
                                 (nloc['beta2'] * row['smb']) +
                                 (nloc['beta3'] * row['hml']) +
                                 (nloc['beta4'] * row['umd'])))

                    df.loc[evt_mask, 'expret'] = df[evt_mask].apply(f_expret,
                                                                    axis=1)

                    nloc = {
                        'alpha': alpha,
                        'beta1': beta1,
                        'beta2': beta2,
                        'beta3': beta3,
                        'beta4': beta4,
                        'const': 0
                    }

                    def f_abret(row):
                        return (row['ret'] - ((nloc['alpha'] +
                                               (nloc['beta1'] * row['mktrf']) +
                                               (nloc['beta2'] * row['smb']) +
                                               (nloc['beta3'] * row['hml']) +
                                               (nloc['beta4'] * row['umd']))))

                    df.loc[evt_mask, 'abret'] = df[evt_mask].apply(f_abret,
                                                                   axis=1)

                    nloc = {'const': 0}

                    def f_cret(row):
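                        # Same compounding recurrence as f_cexpret, applied to realized returns.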
                        tmp = ((row['ret'] * nloc['const']) +
                               (row['ret'] + nloc['const']))
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'cret'] = df[evt_mask].apply(f_cret,
                                                                  axis=1)
                    df.loc[evt_mask, 'cret_edate'] = nloc['const']

                    nloc = {'const': 0}

                    def f_cexpret(row):
                        tmp = ((row['expret'] * nloc['const']) +
                               (row['expret'] + nloc['const']))
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'cexpret'] = df[evt_mask].apply(f_cexpret,
                                                                     axis=1)
                    nloc = {'const': 0}

                    def f_car(row):
                        tmp = (row['abret'] + nloc['const'])
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'car'] = df[evt_mask].apply(f_car, axis=1)
                    df.loc[evt_mask, 'car_edate'] = nloc['const']

                    nloc = {'const': 0}

                    def f_sar(row):
                        tmp = (row['abret'] / np_sqrt(row['var_estp']))
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'sar'] = df[evt_mask].apply(f_sar, axis=1)
                    df.loc[evt_mask, 'sar_edate'] = nloc['const']

                    nloc = {'const': 0, 'evtrang': evtrang}

                    def f_scar(row):
                        tmp = (row['car'] / np_sqrt(
                            (evtrang * row['var_estp'])))
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'scar'] = df[evt_mask].apply(f_scar,
                                                                  axis=1)
                    df.loc[evt_mask, 'scar_edate'] = nloc['const']

                    nloc = {'const': 0}

                    def f_bhar(row):
                        tmp = (row['cret'] - row['cexpret'])
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'bhar'] = df[evt_mask].apply(f_bhar,
                                                                  axis=1)
                    df.loc[evt_mask, 'bhar_edate'] = nloc['const']

                    df.loc[evt_mask,
                           'pat_scale'] = (_nobs - 2.00) / (_nobs - 4.00)
                    df.loc[evt_mask,
                           'pat_scale_edate'] = (_nobs - 2.00) / (_nobs - 4.00)
                # An unrecognized model string was passed; flag the records
                else:
                    df.loc[evt_mask, 'isest'] = -2

        #################################
        #  STEP 4 - OUTPUT THE RESULTS  #
        #################################
        df_sta = df[df['isevt'] == 1]
        levt = df_sta['evttime'].unique()

        columns = [
            'evttime', 'car_m', 'ret_m', 'abret_m', 'abret_t', 'sar_t',
            'pat_ar', 'cret_edate_m', 'car_edate_m', 'pat_car_edate_m',
            'car_edate_t', 'scar_edate_t', 'bhar_edate_m'
        ]

        idxlist = list(levt)
        df_stats = pd_DataFrame(index=idxlist, columns=columns)
        df_stats = df_stats.fillna(0.00000000)  # with 0s rather than NaNs

        # Event
        df_stats['evttime'] = df_sta.groupby(['evttime'])['evttime'].unique()
        # Means
        df_stats['abret_m'] = df_sta.groupby(['evttime'])['abret'].mean()
        df_stats['bhar_edate_m'] = df_sta.groupby(['evttime'])['bhar_edate'].mean()
        df_stats['car_edate_m'] = df_sta.groupby(['evttime'])['car_edate'].mean()
        df_stats['car_m'] = df_sta.groupby(['evttime'])['car'].mean()
        df_stats['cret_edate_m'] = df_sta.groupby(['evttime'])['cret_edate'].mean()
        df_stats['pat_scale_m'] = df_sta.groupby(['evttime'])['pat_scale'].mean()
        df_stats['pat_car_edate_m'] = 0  # placeholder; recomputed below
        df_stats['ret_m'] = df_sta.groupby(['evttime'])['ret'].mean()
        df_stats['sar_m'] = df_sta.groupby(['evttime'])['sar'].mean()
        df_stats['scar_edate_m'] = df_sta.groupby(['evttime'])['scar_edate'].mean()
        df_stats['scar_m'] = df_sta.groupby(['evttime'])['scar'].mean()
        # Standard deviations
        df_stats['car_v'] = df_sta.groupby(['evttime'])['car'].std()
        df_stats['abret_v'] = df_sta.groupby(['evttime'])['abret'].std()
        df_stats['sar_v'] = df_sta.groupby(['evttime'])['sar'].std()
        df_stats['pat_scale_v'] = df_sta.groupby(['evttime'])['pat_scale'].std()
        df_stats['car_edate_v'] = df_sta.groupby(['evttime'])['car_edate'].std()
        df_stats['scar_edate_v'] = df_sta.groupby(['evttime'])['scar_edate'].std()
        df_stats['scar_v'] = df_sta.groupby(['evttime'])['scar'].std()
        # Counts
        df_stats['scar_n'] = df_sta.groupby(['evttime'])['scar'].count()
        df_stats['scar_edate_n'] = df_sta.groupby(['evttime'])['scar_edate'].count()
        df_stats['sar_n'] = df_sta.groupby(['evttime'])['sar'].count()
        df_stats['car_n'] = df_sta.groupby(['evttime'])['car'].count()
        df_stats['n'] = df_sta.groupby(['evttime'])['evttime'].count()
        # Sums
        df_stats['pat_scale_edate_s'] = df_sta.groupby(['evttime'])['pat_scale_edate'].sum()
        df_stats['pat_scale_s'] = df_sta.groupby(['evttime'])['pat_scale'].sum()

        # T statistics 1
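        # t = mean / (s.d. / sqrt(n)): cross-sectional t-test of the mean.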
        def tstat(row, m, v, n):
            return row[m] / (row[v] / np_sqrt(row[n]))

        df_stats['abret_t'] = df_stats.apply(tstat,
                                             axis=1,
                                             args=('abret_m', 'abret_v', 'n'))
        df_stats['sar_t'] = df_stats.apply(tstat,
                                           axis=1,
                                           args=('sar_m', 'sar_v', 'n'))
        df_stats['car_edate_t'] = df_stats.apply(tstat,
                                                 axis=1,
                                                 args=('car_edate_m',
                                                       'car_edate_v', 'n'))
        df_stats['scar_edate_t'] = df_stats.apply(tstat,
                                                  axis=1,
                                                  args=('scar_edate_m',
                                                        'scar_edate_v',
                                                        'scar_edate_n'))

        # T statistics 2
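        # t = mean * n / sqrt(sum of scales): Patell-type standardized test.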
        def tstat2(row, m, s, n):
            return row[m] / (np_sqrt(row[s]) / row[n])

        df_stats['pat_car'] = df_stats.apply(tstat2,
                                             axis=1,
                                             args=('scar_m', 'pat_scale_s',
                                                   'scar_n'))
        df_stats['pat_car_edate_m'] = df_stats.apply(tstat2,
                                                     axis=1,
                                                     args=('scar_edate_m',
                                                           'pat_scale_edate_s',
                                                           'scar_edate_n'))
        df_stats['pat_ar'] = df_stats.apply(tstat2,
                                            axis=1,
                                            args=('sar_m', 'pat_scale_s',
                                                  'sar_n'))

        # FILE 2
        # EVENT WINDOW
        df_evtw = df.loc[
            (df['isevt'] == 1),
            ['permno', 'edate', 'rdate', 'evttime', 'ret', 'abret']]
        df_evtw = df_evtw.sort_values(['permno', 'evttime'],
                                      ascending=[True, True])

        # FILE 1
        # EVENT DATE
        maxv = max(levt)
        df_evtd = df.loc[(df['isevt'] == 1) & (df['evttime'] == maxv),
                         ['permno', 'edate', 'cret', 'car', 'bhar']]
        df_evtd = df_evtd.sort_values(['permno', 'edate'],
                                      ascending=[True, True])

        if output == 'df':
            retval = {}
            retval['event_stats'] = df_stats
            retval['event_window'] = df_evtw
            retval['event_date'] = df_evtd
            return retval
        elif output == 'print':
            retval = {}
            print(
                tabulate(df_evtd.sort_values(['permno', 'edate'],
                                             ascending=[True, True]),
                         headers='keys',
                         tablefmt='psql'))
            print(tabulate(df_evtw, headers='keys', tablefmt='psql'))
            print(tabulate(df_stats, headers='keys', tablefmt='psql'))
            return retval
        elif output == 'json':
            retval = {}
            retval['event_stats'] = df_stats.to_dict(orient='split')
            retval['event_window'] = df_evtw.to_dict(orient='split')
            retval['event_date'] = df_evtd.to_dict(orient='split')
            # Write this to a file
            with open(os.path.join(self.output_path, 'EventStudy.json'),
                      'w') as outfile:
                json_dump(retval, outfile, cls=EncoderJson)
            # Return the output in case they are doing something programmatically
            return json_dumps(retval, cls=EncoderJson)
        elif output == 'csv':
            retval = ''
            es = StringIO_StringIO()
            df_stats.to_csv(es)
            retval += es.getvalue()
            ew = StringIO_StringIO()
            df_evtw.to_csv(ew)
            retval += "\r"
            retval += ew.getvalue()
            ed = StringIO_StringIO()
            df_evtd.to_csv(ed)
            retval += ed.getvalue()

            # write this to a file
            with open(os.path.join(self.output_path, 'EventStudy.csv'),
                      'w') as outfile:
                outfile.write(retval)

            # return the output in case they are doing something programmatically
            return retval
        elif output == 'xls':
            retval = {}
            xlswriter = pd_ExcelWriter(
                os.path.join(self.output_path, 'EventStudy.xls'))
            df_stats.to_excel(xlswriter, 'Stats')
            df_evtw.to_excel(xlswriter, 'Event Window')
            df_evtd.to_excel(xlswriter, 'Event Date')
            xlswriter.close()  # close() writes the workbook to disk
            return retval
        else:
            pass
def analyze_AG_bipartite_network(genes,
                                 authors_GB_genes,
                                 pub_thresh=1,
                                 save_file_name="author_gene_bp.json",
                                 plot_flag=False):
    gene_list = genes.split(',')

    t0 = time.time()

    # unpickle groupby object
    #authors_GB_genes = pd.read_pickle(author_gene_GB_fname)
    authors_GB_genes = app.authors_GB_genes_loaded

    # get rid of invalid genes in gene_list
    new_gene_list = []
    for gene in gene_list:
        if gene in authors_GB_genes:
            new_gene_list.append(gene)

    gene_list = new_gene_list

    # create list of all authors/weights who have published on at least one gene in gene_list
    AW_list_total = []
    for gene in gene_list:
        AW_list_total.extend(list(authors_GB_genes[gene].index))
    # list() is needed under Python 3, where zip returns an iterator
    AW_list_total = list(zip(*AW_list_total))

    author_list_total = AW_list_total[0]
    weight_list_total = AW_list_total[1]

    print(time.time() - t0)

    author_list_total = pd_Series(author_list_total)
    weight_list_total = pd_Series(weight_list_total, index=author_list_total)

    # take the mean of duplicate entries
    df_temp = pd_DataFrame(
        {
            'weight': list(weight_list_total),
            'author': list(author_list_total)
        },
        index=range(len(author_list_total)))
    AW_gb_temp = df_temp.weight.groupby(df_temp['author']).mean()

    author_list_total = list(AW_gb_temp.index)
    weight_list_total = list(AW_gb_temp.values)
    weight_list_total = pd_Series(weight_list_total, index=author_list_total)

    # make a dataframe, indexed by authors in author_list_total, with columns = entries in gene_list
    author_gene_df = pd_DataFrame(np.zeros(
        [len(author_list_total), len(gene_list)]),
                                  index=author_list_total,
                                  columns=gene_list)

    print(time.time() - t0)

    # fill in the dataframe
    for gene in gene_list:
        temp = list(authors_GB_genes[gene].index)
        temp = list(zip(*temp))  # list() needed under Python 3
        authors_temp = list(np.unique(temp[0]))
        author_gene_df.loc[authors_temp, gene] = weight_list_total[authors_temp]

    print(time.time() - t0)

    # add a column for total weight
    author_gene_df['total_weight'] = np.sum(np.array(author_gene_df), 1)

    author_gene_df.sort_values('total_weight', inplace=True, ascending=False)

    # next, convert this dataframe into bipartite network
    # make the small bipartite graph
    author_gene_bp = nx.Graph()

    # pick out authors which have published on > pub_thresh genes in gene_list
    index_temp = list(author_gene_df['total_weight'][
        author_gene_df['total_weight'] > pub_thresh].index)

    # only allow 200 authors max
    if len(index_temp) > 200:
        author_nodes = index_temp[0:200]
    else:
        author_nodes = index_temp
    #index_temp = list(author_gene_df['total_num'].index)
    #author_nodes = index_temp[0:num_authors]

    print(time.time() - t0)

    for gene in gene_list:
        for author in author_nodes:
            # only add a link if connection exists
            if author_gene_df[gene][author] > 0:
                author_gene_bp.add_edge(gene, author)

    # add all genes in gene_list in case none of them come up
    author_gene_bp.add_nodes_from(gene_list)

    # now apply clustering algo to the bipartite graph
    partition = community.best_partition(author_gene_bp)
    partition = pd_Series(partition)
    col_temp_authors = partition[author_nodes]
    col_temp_genes = partition[gene_list]
    col_temp = partition[list(author_gene_bp.nodes())]

    if plot_flag:
        # plot graph if plot_flag = True
        plt.figure(figsize=[15, 15])
        pos = nx.spring_layout(author_gene_bp, k=.3)
        #nx.draw(author_gene_bp,pos=pos,alpha=.5,node_size=100,node_color = col_temp,cmap='Paired')

        gene_list = list(gene_list)
        nx.draw_networkx_nodes(author_gene_bp,
                               nodelist=author_nodes,
                               node_color=col_temp_authors,
                               cmap='Paired',
                               pos=pos,
                               alpha=.5,
                               node_size=100)
        nx.draw_networkx_nodes(author_gene_bp,
                               nodelist=gene_list,
                               node_color=col_temp_genes,
                               cmap='Paired',
                               pos=pos,
                               alpha=.5,
                               node_size=200,
                               node_shape='s')
        nx.draw_networkx_edges(author_gene_bp, pos=pos, alpha=.1)
        node_subset_dict = dict(zip(index_temp[0:20], index_temp[0:20]))
        gene_subset_dict = dict(zip(gene_list, gene_list))
        node_subset_dict.update(gene_subset_dict)  # update() mutates in place, returns None
        nx.draw_networkx_labels(author_gene_bp,
                                pos=pos,
                                labels=node_subset_dict)

    # Set up json for saving
    # what should the colors be??
    num_communities = len(np.unique(col_temp))
    color_list = plt.cm.gist_rainbow(np.linspace(0, 1, num_communities))

    # blend the community colors (so that to-nodes are a mixture of all the communities they belong to)
    rfrac, gfrac, bfrac = calc_community_fraction(author_gene_bp, author_nodes,
                                                  gene_list, partition,
                                                  color_list)

    # save network in json format
    nodes = list(author_gene_bp.nodes())  # list() so the views are indexable
    numnodes = len(nodes)
    edges = list(author_gene_bp.edges())  # EdgeView is not subscriptable
    numedges = len(edges)
    #nodes_dict = [{"id":n,"com":col_temp[n],"degree":author_gene_bp.degree(n)} for n in nodes]
    nodes_dict = [{
        "id": n,
        "com": col_temp[n],
        "degree": author_gene_bp.degree(n),
        "rfrac": rfrac[n] * 255,
        "gfrac": gfrac[n] * 255,
        "bfrac": bfrac[n] * 255
    } for n in nodes]
    node_map = dict(zip(
        nodes, range(numnodes)))  # map to indices for source/target in edges
    edges_dict = [{
        "source": node_map[edges[i][0]],
        "target": node_map[edges[i][1]]
    } for i in range(numedges)]

    #import json
    json_graph = {"directed": False, "nodes": nodes_dict, "links": edges_dict}
    #json.dump(json_graph,open(save_file_name,'w'))

    print(time.time() - t0)

    return json_graph
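
A minimal driver sketch for the function above (not part of the original source): it assumes the Flask-style `app.authors_GB_genes_loaded` object has been populated beforehand, and the gene symbols are placeholders. Note that the `authors_GB_genes` argument is effectively ignored, because the function rebinds it from `app` internally.

# Hypothetical usage sketch -- gene symbols below are illustrative assumptions
graph = analyze_AG_bipartite_network(
    genes='BRCA1,TP53',       # placeholder comma-separated gene list
    authors_GB_genes=None,    # rebound inside the function from app
    pub_thresh=1,
    plot_flag=False)
print(len(graph['nodes']), 'nodes,', len(graph['links']), 'links')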
Esempio n. 11
0
    hash_results += sub_counts

print(s)
print('Substrings count by naive search: {}'.format(naive_results))
print('Substrings count by Rabin-Karp with hash(): {}'.format(hash_results))

# 2. Encode any string of three words using the Huffman algorithm.

seed(42)

message = input('Enter any string: ')
message_list = list(message)

message_symb, message_freq = np_unique(message_list, return_counts=True)

df = pd_DataFrame({'s': message_symb, 'f': message_freq})
message_dict = dict(zip(message_symb, ['' for _ in range(len(message_symb))]))

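# Huffman construction: repeatedly merge the two least-frequent rows; every
# symbol contained in one side of the merged pair gets a leading '0' or '1'
# (the side is chosen at random, reproducibly via seed(42) above).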
while df.shape[0] >= 2:
    df.sort_values(by=['f'], inplace=True)  #by=['f', 's'], ascending=True
    i0, i1 = choice([[1, 0], [0, 1]])
    for s in message_dict:
        if s in df.iloc[i0].s:
            message_dict[s] = '0' + message_dict[s]
        if s in df.iloc[i1].s:
            message_dict[s] = '1' + message_dict[s]
    df = df.append(df.iloc[0:2].sum(), ignore_index=True)
    df = df.iloc[2:]

coded_message = message
for s in message_dict: