import json
import logging


def run_rolodex(input_file, output_file):
    """
    Takes an input file of personal information in multiple formats,
    normalizes every valid entry, and dumps the sorted result into an output file.

    @param string input_file  input file name with path from current dir
    @param string output_file output file name with path from current dir
    """
    persons = []
    error_indices = []
    normalizer = Normalizer()
    with open(input_file) as in_fh:
        for line_number, line in enumerate(in_fh, start=1):
            try:
                person = normalizer.normalize(line.rstrip())
                persons.append(person)
            except NormalizationException:
                error_indices.append(line_number)
    sorted_persons = sorted(persons, key=str)
    output_dict = {
        "entries": sorted_persons,
        "errors": error_indices
    }
    with open(output_file, 'w') as out_fh:
        json.dump(output_dict, out_fh, indent=2, sort_keys=True)
    logging.info("Completed, please check output file.")
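# Illustrative sketch only: run_rolodex() above relies on a Normalizer whose normalize()
# either returns a JSON-serializable person record or raises NormalizationException on
# unrecognized input. The stand-in below shows that assumed contract for a single
# "Lastname, Firstname, 555-555-5555" format; the real normalizer accepts multiple formats.
import re


class NormalizationException(Exception):
    """Raised when a line cannot be parsed into a person record."""


class Normalizer(object):
    PHONE_RE = re.compile(r"^\d{3}-\d{3}-\d{4}$")

    def normalize(self, line):
        parts = [part.strip() for part in line.split(",")]
        if len(parts) != 3 or not self.PHONE_RE.match(parts[2]):
            raise NormalizationException("unrecognized format: %r" % line)
        return {"lastname": parts[0], "firstname": parts[1], "phonenumber": parts[2]}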
def generate(self, edgeCount, tfidf=False, window_size=0, degree=False, closeness=False, groups=False):
    """
    Builds a bigram co-occurrence graph from the "HISTORY OF PRESENT ILLNESS" sections
    of the smoking-records XML, keeps the edgeCount most frequent bigrams (TF-IDF- or
    count-weighted, optionally over a sliding window), and returns the graph as
    node-link JSON, optionally annotated with centralities and Louvain groups.
    """
    parser = XMLDataframeParser()
    text = parser.getText("./data/smokingRecords.xml")
    parser.addFeatureFromText(text, "HISTORY OF PRESENT ILLNESS :", "", True, True, "illness")
    df = parser.getDataframe()
    df_xml = parser.removeEmptyEntries(df, "illness")
    normalizer = Normalizer()
    if tfidf:
        if window_size == 0:
            vectorizer = TfidfVectorizer(
                tokenizer=lambda text: normalizer.normalize(text, True, False),
                ngram_range=(2, 2))
            mostFreq2Grams = self.get_first_n_words(vectorizer, df_xml.illness, edgeCount)
        else:
            vectorizer = TfidfVectorizer(
                analyzer=lambda text: self.custom_analyser(text, 2, int(window_size)))
            mostFreq2Grams = self.get_first_n_words(
                vectorizer, normalizer.normalizeArray(df_xml.illness, True, False), edgeCount)
    else:
        if window_size == 0:
            vectorizer = CountVectorizer(
                tokenizer=lambda text: normalizer.normalize(text, True, False),
                ngram_range=(2, 2))
            mostFreq2Grams = self.get_first_n_words(vectorizer, df_xml.illness, edgeCount)
        else:
            vectorizer = CountVectorizer(
                analyzer=lambda text: self.custom_analyser(text, 2, int(window_size)))
            mostFreq2Grams = self.get_first_n_words(
                vectorizer, normalizer.normalizeArray(df_xml.illness, True, False), edgeCount)
    df_graph = self.create_dataframe(mostFreq2Grams)
    GF = nx.from_pandas_edgelist(df_graph, 'Node1', 'Node2', ["Weight"])
    if degree:
        # calculate degree centrality
        degree_centrality = nx.degree_centrality(GF)
        nx.set_node_attributes(GF, degree_centrality, "degree_centrality")
    if closeness:
        # calculate closeness centrality
        closeness_centrality = nx.closeness_centrality(GF)
        nx.set_node_attributes(GF, closeness_centrality, "closeness_centrality")
    if groups:
        # calculate partitions
        partition = community.best_partition(GF)
        nx.set_node_attributes(GF, partition, "group")
    payload = json_graph.node_link_data(GF)
    return payload
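# Usage sketch (names assumed): generate() returns the dict produced by networkx's
# json_graph.node_link_data, so it can be written straight to JSON for a front-end
# graph view. "builder" stands for an instance of whatever class owns generate();
# that class is not shown in the snippet above.
import json


def dump_graph_payload(builder, path="graph.json", edge_count=50):
    payload = builder.generate(edge_count, tfidf=True, degree=True, closeness=True, groups=True)
    with open(path, "w") as fh:
        json.dump(payload, fh, indent=2)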
def main(p):
    start = time.time()
    # Select the files whose names end with 'json.gz'
    file_name_list = filter(lambda x: x.endswith('json.gz'), os.listdir(p))
    # TODO: check that there are exactly 24 files (glob module)
    for file_name in file_name_list:
        with open(os.path.join(p, file_name), 'rb') as f:
            raw_json_file = gzip.GzipFile(fileobj=f)

            record_cleaner = Cleaner()
            record_grouper = Grouper(db)
            record_normalizer = Normalizer(db)
            mongo_helper = MongoHelper(db)
            counter = ActorCounter()
            evaluater = Evaluater()

            # Data cleaning
            record_cleaner.set_dirty_data(raw_json_file)
            record_cleaner.clean()
            clean_record = record_cleaner.get_clean_data()
            log.log('clean record %s' % len(clean_record))

            # Data processing
            # Grouping
            record_grouper.set_records(clean_record)
            record_grouper.group()
            record_actor_exist = record_grouper.get_group_1()
            record_actor_new = record_grouper.get_group_2()
            log.log('record_actor_exist: %s' % len(record_actor_exist))
            log.log('record_actor_new: %s' % len(record_actor_new))

            # Process the records whose actor already exists
            log.log('Begin processing actor-exist records...')
            # It is enough to drop the record's actor_attributes
            for record in record_actor_exist:
                del record['actor_attributes']
            log.log('Finished.')

            # Process the records whose actor does not exist yet
            record_normalizer.set_records(record_actor_new)
            record_normalizer.normalize()
            record_actor_new = record_normalizer.get_record_actor_new()
            new_actors = record_normalizer.get_new_actors()

            # Push today's locally collected new actors to the database
            actors = new_actors.values()
            mongo_helper.insert_new_actors(actors)
            # Update the corresponding counters in Redis for the new actors
            counter.count_actor_list(actors)

            # Compute the val of each record
            evaluater.set_records(record_actor_exist)
            evaluater.evaluate()
            val_actor_exist = evaluater.get_val_cache()

            evaluater.set_records(record_actor_new)
            evaluater.evaluate()
            val_actor_new = evaluater.get_val_cache()

            # Insert the records into the database
            mongo_helper.insert_new_reocrds(record_actor_new)
            mongo_helper.insert_new_reocrds(record_actor_exist)
            # Update today's per-user val increments in the database
            mongo_helper.update_val(val_actor_new)
            mongo_helper.update_val(val_actor_exist)

            record_cleaner.free_mem()
            del record_cleaner
            del record_grouper
            del record_normalizer
            del mongo_helper
            del counter
            del evaluater

    # Generate the CSV file
    util.grcount2csv()
    end = time.time()
    log.log('total: %s s' % (end - start))
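# Hypothetical entry point for the pipeline above (argument handling assumed, not part
# of the original script): the directory passed to main() is expected to contain the
# day's gzipped JSON event dumps, i.e. the 24 hourly '*.json.gz' files the TODO refers to.
if __name__ == '__main__':
    import sys
    main(sys.argv[1] if len(sys.argv) > 1 else '.')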
def run_pipeline():
    # get training data
    training_data = pd.read_csv('worldbank-data/WDI_Data.csv')
    training_data.set_index(['Country Name', 'Indicator Name'], inplace=True)

    # convert to panel
    panel = training_data.to_panel()
    panel.drop(['Indicator Code', 'Country Code'], axis=0, inplace=True)
    panel = panel.swapaxes(0, 1)

    indicators_to_use = [
        'Agriculture, value added (% of GDP)',
        'Industry, value added (% of GDP)',
        'Services, etc., value added (% of GDP)',
        'Domestic credit provided by financial sector (% of GDP)',
        'GDP growth (annual %)',
        'GDP (current US$)',
        'Expense (% of GDP)',
        'Inflation, consumer prices (annual %)',
        'Inflation, GDP deflator (annual %)',
        'Total debt service (% of exports of goods, services and primary income)',
        'Current account balance (BoP, current US$)',
        'External balance on goods and services (% of GDP)',
        'Health expenditure, total (% of GDP)',
        'Tax revenue (% of GDP)',
        'Gross capital formation (% of GDP)',
        'Gross savings (% of GDP)',
        'Net investment in nonfinancial assets (% of GDP)',
        'Bank capital to assets ratio (%)',
        'Bank nonperforming loans to total gross loans (%)',
        'Broad money (% of GDP)',
        'Commercial bank branches (per 100,000 adults)',
        'Deposit interest rate (%)',
        'Real interest rate (%)',
        'Risk premium on lending (lending rate minus treasury bill rate, %)',
        'Total reserves (includes gold, current US$)',
        'Unemployment, total (% of total labor force) (modeled ILO estimate)',
        'Interest rate spread (lending rate minus deposit rate, %)'
    ]
    print len(indicators_to_use), 'indicators used'
    panel = panel[:, :, indicators_to_use]

    target_variables = [
        'Agriculture, value added (% of GDP)',
        'Industry, value added (% of GDP)',
        'Services, etc., value added (% of GDP)',
        'GDP growth (annual %)',
        'Inflation, GDP deflator (annual %)',
        'Gross capital formation (% of GDP)',
        'Gross savings (% of GDP)',
        'Bank capital to assets ratio (%)',
        'Bank nonperforming loans to total gross loans (%)',
        'Deposit interest rate (%)',
        'Real interest rate (%)',
        'Risk premium on lending (lending rate minus treasury bill rate, %)',
        'Unemployment, total (% of total labor force) (modeled ILO estimate)',
        'Interest rate spread (lending rate minus deposit rate, %)'
    ]

    # drop countries with mostly missing data, such as Samoa, Lesotho and so on
    useful_countries = []
    for country in panel.axes[0]:
        if find_null_percentage(panel[country, :, :]) < 0.7:
            useful_countries.append(country)
    panel = panel.ix[useful_countries, :, :]

    normalizer = Normalizer(panel)
    normalized_panel = normalizer.normalize(panel)
    # # visualize normalization:
    # for indicator in normalized_panel.axes[2]:
    #     plot_hist(indicator, [panel, normalized_panel])

    # select train data
    years_to_validate = 1
    years_to_predict = 10
    years_train = generate_year_list(stop=2016 - years_to_validate)
    years_val = generate_year_list(start=2016 - years_to_validate + 1)
    years_predict = generate_year_list(start=2017, stop=2016 + years_to_predict)
    train_panel = normalized_panel[:, years_train, :].copy()

    # fill missing values:
    # either banal mean or median filling
    # or sampling with a generative bidirectional LSTM - see https://arxiv.org/abs/1306.1091
    generative_model = dense_generative_model(train_panel, hidden_layers=[120], epochs=100)
    sampled_filled_values = iterative_fill(generative_model, train_panel, normalizer, iterations=50, burn_in=10)
    train_panel.update(sampled_filled_values, overwrite=False)
    # or
    # train_panel.fillna(0, inplace=True)
    # or
    # train_panel = iterative_fill_bLSTM(train_panel)
    # or
    # filled_panel = fill_missing_bLSTM(train_panel, epochs=100)
    # train_panel.update(filled_panel, overwrite=False)
    # or
    # interpolate(train_panel)

    # create 1-step-ahead model
    epochs = 200
    hl = [100, 100]
    print "ARCHITECTURE:", hl
    print 'EPOCHS:', epochs
    X_train = train_panel[:, years_train, :][:, :-1, :]
    y_train = train_panel[:, years_train, :][:, 1:, :]
    model = dense_gradient_model(X_train, y_train, hidden_layers=hl, d=0.2, patience=50, epochs=epochs)

    # finally, predict
    for start, year in enumerate(years_val + years_predict):
        predictions = model.predict(train_panel[:, start + 1:, :].values)[:, -1, :]
        train_panel = train_panel.swapaxes(0, 1)
        new_year_df = pd.DataFrame(data=predictions, index=train_panel.axes[1], columns=y_train.axes[2])
        train_panel[year] = new_year_df
        train_panel = train_panel.swapaxes(0, 1)

    print "score:", rmse(
        normalized_panel[:, years_val, target_variables].values,
        train_panel[:, years_val, target_variables].values)

    # revert to original scale and distributions
    train_panel = normalizer.renormalize(train_panel)

    # convert to dataframe, and write relevant information to file
    target_countries = ['Bulgaria', 'Cyprus', 'Albania']
    train_panel = train_panel.swapaxes(0, 1)
    df = train_panel[:, target_countries, target_variables].to_frame(filter_observations=False)
    df.to_csv('Predictions.csv')
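# The helpers used by run_pipeline() above (find_null_percentage, generate_year_list)
# are defined elsewhere; the sketches below only illustrate the behavior the pipeline
# appears to assume (fraction of missing cells, and WDI-style year column labels).
# They are guesses, not the original implementations.
def find_null_percentage(frame):
    # fraction of NaN cells in a country's years-by-indicators slice
    return float(frame.isnull().sum().sum()) / frame.size


def generate_year_list(start=1960, stop=2016):
    # WDI columns are year strings such as '1960', '1961', ..., '2016'
    return [str(year) for year in range(start, stop + 1)]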
    _text = text
    for func in funcs:
        _text = func(_text)
    return _text


if __name__ == '__main__':
    import sys
    import traceback
    from normalize import Normalizer

    fd = open(sys.argv[1]) if len(sys.argv) >= 2 else sys.stdin
    ps = JaWikiPreprocess()
    norm = Normalizer()
    for _line in (_.strip() for _ in fd):
        for line in _line.split("。"):
            try:
                conv = ps.execute(line)
                if conv:
                    print(norm.normalize(conv + "。"))
            except KeyboardInterrupt:
                exit()
            except Exception:
                traceback.print_exc()

# for file in jawiki-latest*.txt; do python
#   ~/Projects/cabocha/jawiki_preprocess.py $file >preprocess/$file.pre.txt;
# done
# for file in ~/jawiki/preprocess/jawiki-latest*.txt;
#   do python ~/Projects/cabocha/case_mongo.py $file || exit; done