def test_load_country_continent():
    """Check that the country/continent table collapses to six continent rows."""
    country_continent = utils.load_country_continent()
    logging.debug(f"{len(country_continent)} countries parsed")

    # One output row per (continent code, continent name) pair, carrying the
    # number of Country_Number entries that fall in that group.
    by_continent = country_continent >> group_by(X.Continent_Code, X.Continent_Name)
    cont = by_continent >> summarize(num_countries=n(X.Country_Number))
    logging.debug(cont)

    assert len(cont) == 6
def test_load_countries():
    """Check that the countries table exposes the expected column names."""
    countries = utils.load_countries()

    # Per-continent country counts are only logged; they are not asserted on.
    per_continent = countries >> group_by(X.continent_code)
    cont = per_continent >> summarize(num_countries=n(X.country_code))
    logging.debug(cont)

    required_columns = {'country_code', 'continent_code', 'country_name'}
    assert required_columns <= set(countries.columns)
# Solution 2 (as series of list) emails.str.findall(pattern, flags=re.IGNORECASE) # Solution 3 (as list) [x[0] for x in [re.findall(pattern, email) for email in emails] if len(x) > 0] #%% Mean of a series grouped by another series fruit = pd.Series(np.random.choice(['apple', 'banana', 'carrot'], 10)) weights = pd.Series(np.linspace(1, 10, 10)) print(weights.tolist()) print(fruit.tolist()) import dfply as dplyr df=pd.DataFrame({'weights':weights, 'fruits':fruit}) df >> dplyr.group_by(X.fruits) >> dplyr.summarize(mean=X.weights.mean()) weights.groupby(fruit).mean() #%% Local maxima/minima in a series ser = pd.Series([2, 10, 3, 4, 9, 10, 2, 7, 3]) # Solution dd = np.diff(np.sign(np.diff(ser))) peak_locs = np.where(dd == -2)[0] + 1 #%% Fill Nas df = pd.DataFrame({"A":[None, 1, 2, 3, None, None], "B":[11, 5, None, None, None, 8], "C":[None, 5, 10, 11, None, 8]}) df.bfill(axis ='rows')
# 통합 데이터 전처리 # *********************************************** fileInfo1 = glob.glob('{}/{}'.format(globalVar['inpPath'], 'LSH0167_dataL2.csv')) dataL2 = pd.read_csv(fileInfo1[0], na_filter=False) # breakpoint() # *********************************************** # 데이터 요약 (요약통계량) # *********************************************** # 연소득당 거래금액 따른 기초 통계량 dataL2.describe() # 법정동에 따른 연소득당 거래금액 따른 기초 통계량 dataL3 = ((dataL2 >> dfply.group_by(dfply.X.d2) >> dfply.summarize(meanVal=dfply.mean(dfply.X.val)))) # ******************************************************* # 데이터 요약 (표/그래프 활용) # ******************************************************* # 연소득당 거래금액 따른 히스토그램 saveImg = '{}/{}_{}.png'.format(globalVar['figPath'], serviceName, '연소득당 거래금액 따른 히스토그램') sns.distplot(dataL2['val'], kde=True, rug=False) plt.show() plt.savefig(saveImg, dpi=600, bbox_inches='tight') # 법정동에 따른 연소득당 거래금액 히스토그램 saveImg = '{}/{}_{}.png'.format(globalVar['figPath'], serviceName,
def exec(self):
    """Scrape a CNN article, count its non-stop-word tokens and save a word cloud.

    Steps:
      1. Download the article HTML and select every ``section.zn-body-text``
         element.
      2. Tokenize each section's text with ``word_tokenize`` and keep the
         tokens that are not in NLTK's English stop-word list.
      3. Count token frequencies with dfply (most frequent first), build a
         ``WordCloud`` from the counts, save the figure under
         ``globalVar['figPath']`` and display it.

    Raises:
        Exception: any error raised while scraping, counting or plotting is
            logged and then re-raised unchanged.
    """
    try:
        log.info('[START] {}'.format("exec"))

        # One-time NLTK setup required by word_tokenize / stopwords:
        #   python -c "import nltk; nltk.download('punkt')"
        #   nltk.download('stopwords')

        # 1) Scrape the article content. The response is closed as soon as
        #    BeautifulSoup has consumed it instead of being left open.
        url = "https://edition.cnn.com/2020/06/02/world/nodosaur-fossil-stomach-contents-scn-trnd/index.html"
        with urlopen(url) as html:
            soup = BeautifulSoup(html, 'html.parser')

        section = soup.select('section.zn-body-text')

        # Stop words are loop-invariant: build the set once, not per section.
        stopWords = set(stopwords.words('english'))

        liGetText = []
        for i in section:
            getText = i.get_text()
            log.info("getText : {%s} : {%s}", len(getText), getText)

            # 2) Tokenize and keep the tokens that are not stop words.
            # NOTE(review): the membership test is case-sensitive; NLTK's
            # English stop words appear to be lowercase, so capitalized
            # tokens such as "The" would be kept — confirm whether intended.
            wordTokens = word_tokenize(getText)
            liGetText.extend(j for j in wordTokens if j not in stopWords)

        log.info("liGetText : {%s} : {%s}", len(liGetText), liGetText)

        data = pd.DataFrame({'type': liGetText})

        # 3) Frequency distribution and word-cloud visualization.
        #    Punctuation/quote tokens and the possessive "'s" are dropped
        #    before counting; rows are sorted by descending count.
        dataL1 = ((data >>
                   filter_by(
                       X.type != '.',
                       X.type != ',',
                       X.type != "'",
                       X.type != "''",
                       X.type != "``",
                       X.type != "'s") >>
                   group_by(X.type) >>
                   summarize(number=n(X.type)) >>
                   ungroup() >>
                   arrange(X.number, ascending=False)))

        log.info("dataL1 : {%s} : {%s}", len(dataL1), dataL1)

        # Pre-processing for the plot: map each row's first column (the
        # token) to its second column (the count).
        objData = {row[0]: row[1] for row in dataL1.values}

        log.info("objData : {%s} : {%s}", len(objData), objData)

        wordcloud = WordCloud(
            width=1000,
            height=1000,
            background_color="white").generate_from_frequencies(objData)

        saveImg = '{}/{}_{}'.format(globalVar['figPath'], serviceName, '워드 클라우드.png')
        log.info('[CHECK] saveFile : {}'.format(saveImg))

        plt.imshow(wordcloud, interpolation="bilinear")
        plt.axis("off")
        # Save before show() so the file is written while the figure still exists.
        plt.savefig(saveImg, dpi=600, bbox_inches='tight')
        plt.show()

    except Exception as e:
        log.error("Exception : {}".format(e))
        # Bare raise re-raises the active exception as-is.
        raise
    finally:
        log.info('[END] {}'.format("exec"))