def merge_json():
    """Merge the per-department and per-category yearly summaries into one
    nested structure and write it to final.json.

    Output shape: {year: {zip: {"DEPARTMENT": [...], "CATEGORY": [...]}}}
    where each list entry is {"name": ..., "count": ...}.
    """
    final_dict = {}

    def _accumulate(records, field):
        # Fold a flat list of groupby records into final_dict under `field`
        # ("DEPARTMENT" or "CATEGORY"), normalising year/zip keys to strings.
        for each in records:
            year = str(each["CREATION YEAR"])
            # ZIP CODE arrives as a float (e.g. 64108.0); int() drops the
            # fractional part before stringifying.
            zip_code = str(int(each["ZIP CODE"]))
            bucket = final_dict.setdefault(year, {}).setdefault(zip_code, {})
            bucket.setdefault(field, []).append(
                {"name": each[field], "count": each["COUNT"]}
            )

    _accumulate(read_json("year_dept.json"), "DEPARTMENT")
    _accumulate(read_json("year_category.json"), "CATEGORY")
    write_json(final_dict, "final.json")
def process_time_series_data():
    """Build the monthly time-series of mean days-to-close per
    (year, month, zip, department) and dump it to time_series.json."""
    print("values.")
    df = pd.read_csv(final_data)
    # Keep only records from 2015 onward.
    df = df[df['CREATION YEAR'] >= 2015]
    # Strip thousands separators before the numeric cast.
    cleaned = df['DAYS TO CLOSE'].apply(lambda v: str(v).replace(",", ""))
    df['DAYS TO CLOSE'] = cleaned.astype("float64")
    group_cols = ['CREATION YEAR', 'CREATION MONTH', 'ZIP CODE', 'DEPARTMENT']
    records = (
        df.groupby(by=group_cols)['DAYS TO CLOSE']
        .mean()
        .reset_index()
        .to_dict('records')
    )
    write_json(records, "time_series.json")
def process_final_data():
    """Count 311 cases per (year, zip, department) and per
    (year, zip, category) for 2015 onward; write each summary to JSON."""
    df = pd.read_csv(final_data)
    df = df[df['CREATION YEAR'] >= 2015]

    def _counts(group_cols):
        # One record per group, with its CASE ID count under 'COUNT'.
        return (
            df.groupby(by=group_cols)['CASE ID']
            .count()
            .reset_index(name='COUNT')
            .to_dict('records')
        )

    write_json(_counts(['CREATION YEAR', 'ZIP CODE', 'DEPARTMENT']),
               "year_dept.json")
    write_json(_counts(['CREATION YEAR', 'ZIP CODE', 'CATEGORY']),
               "year_category.json")
def save_result_json(output_path, imgname, res):
    """Save the generated images with their box coordinates and labels.

    For each rendered instance in `res`, writes:
      - <output_path>/images/<imgname>_<i>.jpg   the image (RGB -> BGR for cv2)
      - <output_path>/message/<name>.json        text + word/char positions
      - <output_path>/labels/<name>.txt          CSV rows: box coords + text

    Args:
        output_path: root directory for the three output sub-folders.
        imgname: base name used to derive each instance's file name.
        res: list of dicts with keys 'img', 'wordBB', 'charBB', 'txt'.
    """
    img_op = os.path.join(output_path, "images")
    file_utils.check_path(img_op)
    msg_op = os.path.join(output_path, "message")
    file_utils.check_path(msg_op)
    txt_op = os.path.join(output_path, "labels")
    file_utils.check_path(txt_op)
    for i, temp in enumerate(res):
        print(colorize(Color.GREEN, 'added into the db %s ' % temp['txt']))
        img = temp['img']
        word_bb = temp['wordBB']
        char_bb = temp['charBB']
        txt = temp['txt']
        # TODO: entries containing a newline correspond to two boxes --
        # split each entry so every line gets its own row.
        new_text = []
        for line in txt:
            new_text.extend(line.split("\n"))
        # Transpose into (n, 4, 2) box-major order.
        # NOTE(review): astype('uint8') truncates coordinates > 255 --
        # confirm images never exceed 255px, otherwise widen the dtype.
        word_boxes = np.transpose(word_bb).astype('uint8')
        dname = "%s_%d" % (imgname, i) + ".jpg"
        print(dname, new_text)
        img_file = os.path.join(img_op, dname)
        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
        cv2.imwrite(img_file, img)
        result_json = {
            'text': new_text,
            'word_pos': word_bb.tolist(),
            'char_pos': char_bb.tolist()
        }
        json_utils.write_json(msg_op, dname, result_json)
        # TODO: unify the output format.
        # Write labels: four corner points + text, comma separated.
        # NOTE(review): loop runs len(txt) times but indexes new_text -- if
        # any txt entry contained '\n' the lists differ in length; verify.
        txt_name = os.path.splitext(dname)[0] + ".txt"
        # Fix: context manager so the file is closed even if a write raises
        # (the original leaked the handle on error).
        with open(os.path.join(txt_op, txt_name), 'w', encoding='utf-8') as f2:
            writer = csv.writer(f2)
            for j in range(len(txt)):
                box = word_boxes[j]
                word = new_text[j]
                writer.writerow(np.append(box.reshape(-1), word))
def process_dept_df():
    """Build the department classification dataset: write the label-index ->
    department mapping to JSON and the (u_id, desc, label) rows to CSV."""
    # Fix: .copy() so the column assignment / in-place rename below act on an
    # independent frame, not a view of final_data (avoids
    # SettingWithCopyWarning and accidental mutation of the source frame).
    dept_df = final_data[['CASE ID', 'DESCRIPTION', 'DEPARTMENT']].copy()
    # Assign a stable integer-string index to every distinct department.
    for counter, each in enumerate(dept_df.DEPARTMENT.unique()):
        dept_category[str(counter)] = each
    write_json(dept_category, dept_json)
    print('finished')
    dept_df['label'] = dept_df.DEPARTMENT.apply(apply_dept_label)
    dept_df.rename(columns={
        'CASE ID': 'u_id',
        'DESCRIPTION': 'desc'
    }, inplace=True)
    write_to_csv(dept_df, dept_file)
def process_prob_df():
    """Build the problem-type classification dataset: write the label-index ->
    problem-category mapping to JSON and (u_id, desc, label) rows to CSV."""
    print(list(final_data['REQUEST TYPE'].unique()))
    # Collapse raw request types into coarser problem categories.
    final_data['label'] = final_data['REQUEST TYPE'].apply(apply_prob)
    print(list(final_data.label.unique()))
    # Assign a stable integer-string index to every distinct category.
    for counter, each in enumerate(final_data.label.unique()):
        prob_category[str(counter)] = each
    write_json(prob_category, prob_json)
    # Re-encode category names as their integer-string labels.
    final_data['label'] = final_data['label'].apply(apply_prob_label)
    # Fix: .copy() so the in-place rename below acts on an independent frame,
    # not a view of final_data (avoids SettingWithCopyWarning).
    prob_df = final_data[['CASE ID', 'DESCRIPTION', 'label']].copy()
    prob_df.rename(columns={
        'CASE ID': 'u_id',
        'DESCRIPTION': 'desc'
    }, inplace=True)
    write_to_csv(prob_df, prob_file)
    print('finished')
def save_log(self, filename, env_to_file=None):
    """Serialize the accumulated log to JSON.

    Args:
        filename: destination path for the log JSON (required).
        env_to_file: optional path; when the environment is an in-memory
            object, it is dumped to this separate file and the main log
            stores only the path reference.

    Raises:
        ValueError: if `filename` is None.
    """
    # Fix: explicit raise instead of `assert` -- asserts are stripped under
    # `python -O`, which would silently skip this validation.
    if filename is None:
        raise ValueError("filename not supplied. save_log failed")
    lg = deepcopy(self.log)
    # Convert every drone's info and per-step trajectory entries into
    # JSON-serializable form.
    for _, drone_log in lg["drones"].items():
        drone_log["info"] = drone_log["info"].to_JSONable()
        for step, step_log in drone_log["trajectory"].items():
            drone_log["trajectory"][step] = step_log.to_JSONable()
    if isinstance(self.log["environment"], str):
        # Environment already stored as a path reference.
        lg["environment"] = {"path": self.log["environment"]}
    else:
        if env_to_file is None:
            # Inline the full environment into the log.
            lg["environment"] = self.log["environment"].to_JSONable()
        else:
            # Store a reference and dump the environment to its own file.
            lg["environment"] = {"path": env_to_file}
            write_json(env_to_file, self.log["environment"].to_JSONable())
    write_json(filename, lg)
def get_movie_info():
    """Scrape the schema.org JSON-LD fragment from a Douban movie page and
    persist the parsed blob to data/data.json."""
    headers = {'User-Agent': 'Mozilla/5.0 xxxxxx'}
    basel = 'https://movie.douban.com/subject/1292213/'
    response = requests.get(basel, headers=headers)
    html = response.content.decode('utf-8', 'ignore')
    url_content = re.search(
        r'"@context": "http://schema.org",(.*?)"ratingValue": "9.2"',
        html, re.S)
    # Whole text matched by the regular expression.
    texts = url_content.group()
    # Re-wrap in braces so the fragment parses as a JSON object.
    texts = str("{" + texts + "}}")  # important
    data = json.loads(texts, strict=False)
    movie_info = {
        'name': data['name'],
        'author': data['author'],
        'actor': data['actor'],
        'director': data['director']
    }
    print(movie_info)
    # Note: writes the full parsed blob, not just movie_info.
    ju.write_json(data, r'data/data.json')
def process_json():
    """Pivot the flat time-series records into a nested dict keyed by
    year -> zip code -> "DEPARTMENT" list, then write the result."""
    records = read_json("time_series.json")
    final_dict = {}
    for entry in records:
        # Normalise the grouping keys to strings.
        year = str(entry["CREATION YEAR"])
        zip_code = str(int(entry["ZIP CODE"]))
        month = str(entry["CREATION MONTH"])
        by_zip = final_dict.setdefault(year, {})
        bucket = by_zip.setdefault(zip_code, {})
        dept_list = bucket.setdefault("DEPARTMENT", [])
        dept_list.append({
            "CREATION MONTH": month,
            "NAME": entry["DEPARTMENT"],
            "COUNT": entry["DAYS TO CLOSE"]
        })
    write_json(final_dict, "time_series_final.json")
if DBG: print "Item satisfies auto_contact_sms for filter but already contacted", f if f.satisfies_auto_contact_email(item): if DBG: print "Item satisfies auto_contact_email for filter", f if not already_auto_contacted_email(item): # Auto contact if not auto_contact_email(item, f): printe("ERROR, something went wrong while trying to auto_contact_email") else: if DBG: print "Item satisfies auto_contact_email for filter but already contacted", f f_existing_items = open(FILTERED_ITEMS_FILEPATH, 'w+') total_items = already_existing_items.values() + new_items # While debugging, use pretty-printed JSON, when not debugging anymore, use compact notation for space efficiency if DBG: json_repr = json.dumps(total_items, indent=4, separators=(',', ': ')) else: json_repr = json.dumps(total_items) f_existing_items.write(json_repr) f_existing_items.close() write_json(passed_alerts, PASSED_ALERTS_FILEPATH) write_json(passed_mail_contacts, PASSED_MAILS_FILEPATH) write_json(passed_sms_contacts, PASSED_SMS_FILEPATH)
def dump_data(self):
    """Write both collected item lists to their JSON log files."""
    for data, fname in ((self.log_items, self.fname_log_items),
                        (self.serp_items, self.fname_serp_items)):
        print("Dumping data to %s" % fname)
        write_json(data, fname)
# Word-frequency pass over the segmented text: count each token that
# survives the filters below (not a stopword, longer than one char,
# not a tab or CRLF).
for word in segment:
    if word.strip() not in stopwords:
        if len(word) > 1:
            if word != '\t':
                if word != '\r\n':
                    # Compute the word frequency.
                    if word in word_:
                        word_[word] += 1
                    else:
                        word_[word] = 1
print(word_)
len(word_)  # no-op expression statement -- presumably notebook residue
# Drop the HTML artifact token.
del word_['nbsp']
ju.write_json(word_, os.getcwd() + r"/test/text_data.json")
# Sort by count, descending; word_ becomes a list of (word, count) pairs.
word_ = sorted(word_.items(), key=lambda x: x[1], reverse=True)
# Rebuild a dict in frequency order and overwrite the earlier dump.
dic_temp = {}
for word in word_:
    dic_temp[word[0]] = word[1]
ju.write_json(dic_temp, os.getcwd() + r"/test/text_data.json")
dic_temp["姜明"]  # no-op lookup -- presumably notebook residue; verify
# Split the (word, count) pairs into parallel name/value lists.
for word in word_:
    name.append(word[0])
    value.append(word[1])
name.index("雪艳姐")  # no-op -- presumably notebook residue
value[184]  # no-op -- presumably notebook residue
generatepath = os.getcwd() + r"/test/test_cloud.html"
name[:200]  # no-op -- presumably notebook residue
value[:200]  # no-op -- presumably notebook residue
def close_spider(self, spider):
    """Scrapy hook: flush the collected items to disk when the spider closes."""
    print("Dumping data to %s" % self.fname)
    write_json(self.items, self.fname)