コード例 #1
0
def test_load_country_continent():
    """Check that the country/continent table collapses to six continent rows."""
    country_table = utils.load_country_continent()
    logging.debug(f"{len(country_table)} countries parsed")

    # Group on the (code, name) pair, then count the countries in each group.
    by_continent = country_table >> group_by(X.Continent_Code, X.Continent_Name)
    per_continent = by_continent >> summarize(num_countries=n(X.Country_Number))
    logging.debug(per_continent)

    continent_rows = len(per_continent)
    assert continent_rows == 6
コード例 #2
0
def test_load_countries():
    """Check that the loaded countries table exposes the expected columns."""
    countries = utils.load_countries()

    # The per-continent count is only logged; it is not part of the assertion.
    by_continent = countries >> group_by(X.continent_code)
    per_continent = by_continent >> summarize(num_countries=n(X.country_code))
    logging.debug(per_continent)

    required_columns = {'country_code', 'continent_code', 'country_name'}
    assert required_columns <= set(countries.columns)
コード例 #3
0
# Solution 2 (as series of list): one list of matches per element of `emails`.
emails.str.findall(pattern, flags=re.IGNORECASE)
# Solution 3 (as list): the first match of every entry that has at least one.
# NOTE(review): unlike Solution 2, no IGNORECASE flag is passed here - confirm
# that `pattern` itself covers both cases.
[hits[0] for hits in (re.findall(pattern, email) for email in emails) if hits]



#%% Mean of a series grouped by another series
# Ten random fruit labels paired with ten evenly spaced weights from 1 to 10.
fruit = pd.Series(np.random.choice(['apple', 'banana', 'carrot'], size=10))
weights = pd.Series(np.linspace(start=1, stop=10, num=10))
print(weights.tolist())
print(fruit.tolist())

# Variant 1: dfply pipeline over a two-column frame.
import dfply as dplyr
df = pd.DataFrame({'weights': weights, 'fruits': fruit})
by_fruit = dplyr.group_by(X.fruits)
mean_weight = dplyr.summarize(mean=X.weights.mean())
df >> by_fruit >> mean_weight
# Variant 2: plain pandas, grouping one series by the values of the other.
weights.groupby(fruit).mean()

#%% Local maxima/minima in a series

ser = pd.Series([2, 10, 3, 4, 9, 10, 2, 7, 3])

# Solution (only the maxima are located here).
# The sign of the first difference is +1 while rising and -1 while falling,
# so a rise followed by a fall shows up as -2 in the difference of the signs.
slope_sign = np.sign(np.diff(ser))
dd = np.diff(slope_sign)
# Two successive diffs shift positions by one; +1 maps back to indices of `ser`.
peak_locs = np.flatnonzero(dd == -2) + 1

#%% Fill Nas
# Three columns whose missing entries (None) sit at different positions.
columns = {
    "A": [None, 1, 2, 3, None, None],
    "B": [11, 5, None, None, None, 8],
    "C": [None, 5, 10, 11, None, 8],
}
df = pd.DataFrame(columns)
# Back-fill along the rows axis; the filled frame is returned, not stored,
# so `df` itself keeps its missing values.
df.bfill(axis='rows')
コード例 #4
0
    # Pre-processing of the merged data set
    # ***********************************************
    # glob returns every path matching the pattern; the first match is loaded.
    fileInfo1 = glob.glob('{}/{}'.format(globalVar['inpPath'],
                                         'LSH0167_dataL2.csv'))
    # na_filter=False turns off pandas' missing-value detection while parsing.
    dataL2 = pd.read_csv(fileInfo1[0], na_filter=False)

    # breakpoint()

    # ***********************************************
    # Data summary (summary statistics)
    # ***********************************************
    # Basic statistics of the transaction amount per annual income
    dataL2.describe()

    # Mean transaction amount per annual income for each legal district (d2)
    dataL3 = ((dataL2 >> dfply.group_by(dfply.X.d2) >>
               dfply.summarize(meanVal=dfply.mean(dfply.X.val))))

    # *******************************************************
    # Data summary (tables / graphs)
    # *******************************************************
    # Histogram of the transaction amount per annual income
    saveImg = '{}/{}_{}.png'.format(globalVar['figPath'], serviceName,
                                    '연소득당 거래금액 따른 히스토그램')

    sns.distplot(dataL2['val'], kde=True, rug=False)
    # Save before showing: once show() returns, the displayed figure may
    # already be closed, and savefig() would then write an empty image.
    plt.savefig(saveImg, dpi=600, bbox_inches='tight')
    plt.show()

    # 법정동에 따른 연소득당 거래금액 히스토그램
    saveImg = '{}/{}_{}.png'.format(globalVar['figPath'], serviceName,
コード例 #5
0
    def exec(self):
        """Scrape a CNN article, count its non-stop-word tokens and draw a word cloud.

        The article body is tokenised, English stop words are dropped, the
        remaining tokens are counted, and the counts are rendered as a word
        cloud that is saved under ``globalVar['figPath']`` and then shown.

        Raises:
            Exception: any error raised while scraping, counting or plotting
                is logged and re-raised unchanged.
        """

        try:
            log.info('[START] {}'.format("exec"))

            # breakpoint()

            # python -c "import nltk; nltk.download('punkt')"
            # nltk.download('stopwords')

            # 1) Scrape the article content from the CNN page below.
            url = "https://edition.cnn.com/2020/06/02/world/nodosaur-fossil-stomach-contents-scn-trnd/index.html"
            # The response is read while the soup is built, so it can be
            # closed right after parsing instead of being left open.
            with urlopen(url) as html:
                soup = BeautifulSoup(html, 'html.parser')

            section = soup.select('section.zn-body-text')

            # Stop words: the set does not depend on the section, build it once.
            stopWords = set(stopwords.words('english'))

            liGetText = []
            for i in section:
                getText = i.get_text()

                log.info("getText : {%s} : {%s}", len(getText), getText)

                # Word extraction
                wordTokens = word_tokenize(getText)

                # 2) Pre-process the article content: keep tokens that are not
                #    stop words. The stop-word list is lowercase, so compare on
                #    the lowercased token ("The" must be dropped like "the")
                #    while keeping the token's original spelling.
                liGetText.extend(
                    j for j in wordTokens if j.lower() not in stopWords)

            log.info("liGetText : {%s} : {%s}", len(liGetText), liGetText)

            data = pd.DataFrame({'type': liGetText})

            # 3) Frequency distribution and word-cloud visualisation:
            #    drop punctuation-only tokens, count each token, sort by count.
            dataL1 = ((data >> filter_by(
                X.type != '.', X.type != ',', X.type != "'", X.type != "''",
                X.type != "``", X.type != "'s") >> group_by(X.type) >>
                       summarize(number=n(X.type)) >> ungroup() >> arrange(
                           X.number, ascending=False)))

            log.info("dataL1 : {%s} : {%s}", len(dataL1), dataL1)

            # Pre-processing for the visualisation: token -> count mapping
            # (column 0 is the token, column 1 its count).
            objData = {row[0]: row[1] for row in dataL1.values}

            log.info("objData : {%s} : {%s}", len(objData), objData)

            wordcloud = WordCloud(
                width=1000, height=1000,
                background_color="white").generate_from_frequencies(objData)

            saveImg = '{}/{}_{}'.format(globalVar['figPath'], serviceName,
                                        '워드 클라우드.png')
            log.info('[CHECK] saveFile : {}'.format(saveImg))

            plt.imshow(wordcloud, interpolation="bilinear")
            plt.axis("off")
            # Save first, then show, so the file is written from the live figure.
            plt.savefig(saveImg, dpi=600, bbox_inches='tight')
            plt.show()

        except Exception as e:
            log.error("Exception : {}".format(e))
            # Bare raise re-raises the active exception with its traceback.
            raise
        finally:
            log.info('[END] {}'.format("exec"))