from hdfs import Client, InsecureClient


def get_data():
    client = Client("http://t3.dev:50070", "hadoop")
    # client = InsecureClient(url="http://t3.dev:50070", user="******", root="/")
    print(client.list("/huiqu/common/area.txt"))
    with client.read("/huiqu/common/area.txt/part-00000") as read:
        # print(read.read().decode('utf8'))
        return {"data": read.read().decode('utf8')}
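# --- Hedged example (not in the original): the InsecureClient variant hinted at by the
# commented-out line above. InsecureClient lets you pass the WebHDFS user explicitly;
# the user name here is an assumed placeholder, the URL and path reuse the ones above.
def get_data_insecure():
    client = InsecureClient(url="http://t3.dev:50070", user="hadoop", root="/")
    with client.read("/huiqu/common/area.txt/part-00000", encoding="utf-8") as reader:
        return {"data": reader.read()}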
        print(rowkey)
        mutations = [Mutation(column=self.columnFamily + ":ipaddr", value=ipaddr),
                     Mutation(column=self.columnFamily + ":visitTime", value=visitTime),
                     Mutation(column=self.columnFamily + ":user_id", value=user_id),
                     Mutation(column=self.columnFamily + ":link", value=link)]
        # Commit several rows in one batch
        mutations_batch.append(BatchMutation(row=rowkey, mutations=mutations))
        if len(mutations_batch) % batch_size == 0:
            self.client.mutateRows(self.tablename, mutations_batch)
            mutations_batch = []


if __name__ == "__main__":
    # Set up the HBase connection
    hbasewriteer = CreateTableAndImportData('user_log_info', 'cf_log')
    hbasewriteer.createTable()
    # Connect to HDFS
    client = Client(HDFSNN)
    # Get the list of log files
    logFiles = client.list(LOGPATH)
    # Read each file and import it
    for logfile in logFiles:
        with client.read(os.path.join(LOGPATH, logfile)) as deal_file_handle:
            hbasewriteer.importData(deal_file_handle)
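# --- Hedged sketch (not in the original): the batching above only flushes when the
# batch is an exact multiple of batch_size, so a final partial batch can be lost.
# A generic illustration of the batch-then-flush-remainder pattern; 'send_batch' is a
# stand-in for self.client.mutateRows(self.tablename, ...).
def write_in_batches(rows, batch_size, send_batch):
    batch = []
    for row in rows:
        batch.append(row)
        if len(batch) == batch_size:
            send_batch(batch)
            batch = []
    if batch:  # flush the remainder that the modulo check would miss
        send_batch(batch)

# Example: write_in_batches(all_batch_mutations, 100, lambda b: client.mutateRows(table, b))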
import os

import jieba
from hdfs import Client


class ChatBotModel(object):
    def __init__(self, hadoop_url, hdfs_index_file, local_index_file,
                 corpus_dir, unk_answer='', max_answer_len=1024):
        self.hadoop_url = hadoop_url
        self.hdfs_index_file = hdfs_index_file
        self.local_index_file = local_index_file
        self.corpus_dir = corpus_dir
        self.max_answer_len = max_answer_len
        self.unk_answer = unk_answer
        self.client = None
        self.inverted_index = {}

    def build_connection(self):
        self.client = Client(self.hadoop_url)

    def fetch_index_file(self):
        self.client.download(hdfs_path=self.hdfs_index_file,
                             local_path=self.local_index_file,
                             overwrite=True)

    def load_inverted_index(self):
        with open(self.local_index_file, 'r', encoding='utf-8') as f:
            for line in f:
                word, *querys = line.strip().split('\t')
                for query in querys:
                    file_name, query_id, score = query.split(':')
                    self.inverted_index.setdefault(word, []).append(
                        [file_name, int(query_id), float(score)])

    def prepare(self):
        self.build_connection()
        self.fetch_index_file()
        self.load_inverted_index()

    def read_corpus_answer(self, file_name, query_id):
        file_path = os.path.join(self.corpus_dir, file_name)
        file_status = self.client.status(file_path)
        if file_status['length'] <= query_id:
            return None
        with self.client.read(hdfs_path=file_path, offset=query_id,
                              length=self.max_answer_len, encoding='utf-8') as f:
            answer = f.read().strip().split('\n')[0]
        return answer

    def predict_answer(self, query):
        words = jieba.lcut_for_search(query)
        querys = {}
        for word in words:
            if word not in self.inverted_index:
                continue
            for file_name, query_id, score in self.inverted_index[word]:
                # key the candidate answers by (file_name, query_id) and accumulate scores
                candidate = (file_name, query_id)
                querys[candidate] = querys.get(candidate, 0.0) + score
        if len(querys) == 0:
            return self.unk_answer
        best_query = max(querys.items(), key=lambda x: x[1])
        (best_file_name, best_query_id), best_score = best_query
        best_answer = self.read_corpus_answer(best_file_name, best_query_id)
        if best_answer is None:
            best_answer = self.unk_answer
        return best_answer
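# --- Hedged usage sketch (not in the original): how the class above might be driven.
# The URL and all paths below are hypothetical placeholders.
bot = ChatBotModel(hadoop_url="http://namenode:50070",
                   hdfs_index_file="/chatbot/index/inverted_index.txt",
                   local_index_file="inverted_index.txt",
                   corpus_dir="/chatbot/corpus",
                   unk_answer="Sorry, I don't know.")
bot.prepare()                        # connect, download the index, load it into memory
print(bot.predict_answer("今天天气怎么样"))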
import os

import pandas as pd
from hdfs import Client

# Current approach for reading an HDFS file:
# 1. read the file from HDFS as a binary stream
# 2. save the binary content locally as a .csv file
# 3. load the .csv file with pandas
HDFSHOST = "http://172.16.18.112:50070"
train_path = "/9a7e7ef5a78f4f8388deff28cc5c2115/dataSet/devdatasetdef19761"
test_path = "/9a7e7ef5a78f4f8388deff28cc5c2115/dataSet/devdatasetdef19762"
train_FILENAME = train_path + "/data/Data.csv"  # HDFS file path
test_FILENAME = test_path + "/data/Data.csv"    # HDFS file path

client = Client(HDFSHOST)

# Read the training data
with client.read(train_FILENAME) as tr_s:
    tr_content = tr_s.read()
    tr_s = str(tr_content, 'utf-8')
# Make sure the local copy is fully written (write first, then flush/fsync)
tr_file = open("trainData.csv", "w")
tr_file.write(tr_s)
tr_file.flush()
os.fsync(tr_file.fileno())
tr_file.close()
# Read the local copy back with pandas
df_train = pd.read_csv("trainData.csv", header=0)
print(df_train)

# Read the test data
with client.read(test_FILENAME) as te_fs:
    te_content = te_fs.read()
    te_s = str(te_content, 'utf-8')
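# --- Hedged alternative sketch (not in the original): the temporary .csv round-trip
# above can be skipped by wrapping the downloaded bytes in a BytesIO buffer and handing
# it straight to pandas. Reuses the client and train_FILENAME defined above.
import io

with client.read(train_FILENAME) as reader:
    df_train_direct = pd.read_csv(io.BytesIO(reader.read()), header=0)
print(df_train_direct.head())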
import csv
import os
import time

import numpy as np
import pandas as pd
import sklearn
from easydict import EasyDict as edict
from hdfs import Client
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split

# HDFS_HOSTS1 is the WebHDFS address; it is defined elsewhere in the project.


def train(train_path, test_path, output_path, target, train_split_ratio=0.33,
          penalty='l2', dual=False, tol=1e-4, C=1.0, random_state=None,
          multi_class='ovr'):
    # Record the start time
    time.localtime()
    time_trains_start = time.strftime('%Y{y}%m{m}%d{d} %H{h}%M{f}%S{s}'.format(
        y='/', m='/', d='', h=':', f=':', s=''))
    start_time = time.time()

    # Input file paths
    train_FILENAME = train_path + "/data/Data.csv"  # HDFS file path
    test_FILENAME = test_path + "/data/Data.csv"    # HDFS file path
    client = Client(HDFS_HOSTS1)

    # Read the training data
    with client.read(train_FILENAME) as tr_s:
        tr_content = tr_s.read()
        tr_s = str(tr_content, 'utf-8')
    # Make sure the local copy is fully written (write first, then flush/fsync)
    tr_file = open("trainData.csv", "w")
    tr_file.write(tr_s)
    tr_file.flush()
    os.fsync(tr_file.fileno())
    tr_file.close()
    df_train = pd.read_csv("trainData.csv", header=0)
    print(df_train)

    # Read the test data
    with client.read(test_FILENAME) as te_fs:
        te_content = te_fs.read()
        te_s = str(te_content, 'utf-8')
    # Make sure the local copy is fully written
    te_file = open("testData.csv", "w")
    te_file.write(te_s)
    te_file.flush()
    os.fsync(te_file.fileno())
    te_file.close()
    df_test = pd.read_csv("testData.csv", header=0)
    print(df_test)

    min_max_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
    test_data_num = df_test.shape[0]   # was df_train.shape[0] in the original, which counted the training set twice
    train_data_num = df_train.shape[0]

    # Preprocess the prediction (test) set
    df_test = min_max_scaler.fit_transform(df_test)
    df_test = np.array(df_test)

    # Data processing and cleaning
    cols = [tmp_i for tmp_i in df_train.columns if tmp_i not in [target]]
    X = df_train[cols]
    X = np.array(X)
    X = min_max_scaler.fit_transform(X)
    Y = df_train[target]
    Y = np.array(Y)

    # Split the training data
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=train_split_ratio)

    # Train with scikit-learn's LogisticRegression
    # (keyword arguments: the original passed them positionally, which mismatches the signature)
    clf = LogisticRegression(penalty=penalty, dual=dual, tol=tol, C=C,
                             random_state=random_state, multi_class=multi_class,
                             solver='liblinear')
    clf.fit(X_train, Y_train)

    # Accuracy
    train_acc = clf.score(X_test, Y_test)
    print('score Scikit learn: ', train_acc)
    # Precision
    train_precision_score = precision_score(Y_test, clf.predict(X_test))
    # Recall
    train_recall_score = recall_score(Y_test, clf.predict(X_test))
    # F1 score
    train_f1_score = f1_score(Y_test, clf.predict(X_test))
    # ROC AUC
    train_roc_auc_score1 = roc_auc_score(Y_test, clf.predict(X_test))

    # Predict on the test set with the trained model
    result = clf.predict(df_test)
    # print(result)

    # Record the end time and compute the total duration
    train_end = time.time()
    train_seconds = train_end - start_time
    m, s = divmod(train_seconds, 60)
    h, m = divmod(m, 60)
    time_trains_all = "%02d:%02d:%02d" % (h, m, s)

    # ---------------------------- Save the training results ----------------------------
    ## Save the model summary report
    # abstract_path = HDFS_HOSTS1 + output_path + '/abstract/data/'
    abstract_path = output_path + '/abstract/data/'
    f = open('abstract.csv', mode='w', newline='')
    fileheader = [
        'FrameWork', 'Version', 'model', 'accuracy', 'time_trains_start',
        'time_trains_all', 'test_data_num', 'train_data_num'
    ]
    w = csv.DictWriter(f, fileheader)
    w.writeheader()
    csv_dict = edict()
    csv_dict.FrameWork = 'Scikit-learn'
    csv_dict.Version = sklearn.__version__
    csv_dict.model = '%s' % LogisticRegression
    csv_dict.accuracy = str(train_acc)
    csv_dict.time_trains_start = time_trains_start
    csv_dict.time_trains_all = time_trains_all
    csv_dict.test_data_num = str(test_data_num)
    csv_dict.train_data_num = str(train_data_num)
    w.writerow(csv_dict)
    f.close()
    client.delete(abstract_path + 'abstract.csv')
    client.upload(abstract_path + 'abstract.csv', 'abstract.csv')
    # if len(client.list(abstract_path)):
    #     client.delete(abstract_path + 'abstract.csv')
    #     client.upload(abstract_path + 'abstract.csv', 'abstract.csv')
    # else:
    #     client.upload(abstract_path + 'abstract.csv', 'abstract.csv')

    ## Save the model version info csv
    version_path = output_path + '/msg/data/'
    f = open('msg.csv', mode='w', newline='')
    fileheader = [
        'accuracy', 'time_trains_start', 'time_trains_all', 'test_data_num',
        'train_data_num'
    ]
    w = csv.DictWriter(f, fileheader)
    w.writeheader()
    csv_dict = edict()
    csv_dict.accuracy = str(train_acc)
    csv_dict.time_trains_start = time_trains_start
    csv_dict.time_trains_all = time_trains_all
    csv_dict.test_data_num = str(test_data_num)
    csv_dict.train_data_num = str(train_data_num)
    w.writerow(csv_dict)
    f.close()
    client.delete(version_path + 'msg.csv')
    client.upload(version_path + 'msg.csv', 'msg.csv')

    ## Save the training evaluation metrics report
    file_csv_path = output_path + '/evaluation/data/'
    f = open('evaluation.csv', mode='w', newline='')
    fileheader = [
        'accuracy', 'train_precision_score', 'train_recall_score',
        'train_f1_score', 'train_roc_auc_score1'
    ]
    w = csv.DictWriter(f, fileheader)
    w.writeheader()
    csv_dict = edict()
    csv_dict.accuracy = str(train_acc)
    csv_dict.train_precision_score = train_precision_score
    csv_dict.train_recall_score = train_recall_score
    csv_dict.train_f1_score = train_f1_score
    csv_dict.train_roc_auc_score1 = train_roc_auc_score1
    w.writerow(csv_dict)
    f.close()
    client.delete(file_csv_path + 'evaluation.csv')
    client.upload(file_csv_path + 'evaluation.csv', 'evaluation.csv')

    # Save the test-set prediction results
    file_csv_path = output_path + '/result/data/'
    # The dict keys become the csv column names
    dataframe = pd.DataFrame({target: result})
    # Write the DataFrame to csv; index controls whether row labels are written (default True)
    dataframe.to_csv("result.csv", index=False, sep=',')
    client.delete(file_csv_path + 'result.csv')
    client.upload(file_csv_path + 'result.csv', 'result.csv')
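# --- Hedged usage sketch (not in the original): a hypothetical call to train().
# HDFS_HOSTS1 must point at the WebHDFS endpoint; the value below and the dataset/output
# paths and target column are placeholders, not taken from the original project.
HDFS_HOSTS1 = "http://namenode:50070"
train(train_path="/demo/dataSet/train",
      test_path="/demo/dataSet/test",
      output_path="/demo/output",
      target="label",
      train_split_ratio=0.33)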
        rowkey = md5(str(user_id) + str(visitTime))
        print(rowkey)
        mutations = [Mutation(column=self.columnFamily + ":ipaddr", value=ipaddr),
                     Mutation(column=self.columnFamily + ":visitTime", value=visitTime),
                     Mutation(column=self.columnFamily + ":user_id", value=user_id),
                     Mutation(column=self.columnFamily + ":link", value=link)]
        self.client.mutateRow(self.tablename, rowkey, mutations)


if __name__ == "__main__":
    # Set up the HBase connection
    hbasewriteer = CreateTableAndImportData('user_log_info', 'cf_log')
    hbasewriteer.createTable()
    # Connect to HDFS
    client = Client(HDFSNN, timeout=200000)
    # Get the list of log files
    logFiles = client.list(LOGPATH)
    # Read each file line by line and import the records
    for logfile in logFiles:
        # encoding='utf-8' so the lines come back as str rather than raw bytes
        with client.read(LOGPATH + logfile, encoding='utf-8') as fp:
            for line in fp:
                record = line.split(" ")
                hbasewriteer.importData(record)
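# --- Hedged sketch (not in the original): the md5() helper used above is not shown in
# the snippet. If it is a thin wrapper over hashlib, it presumably looks roughly like
# this (hypothetical definition):
import hashlib

def md5(text):
    # Hash the concatenated key material and return a hex string usable as an HBase rowkey
    return hashlib.md5(text.encode('utf-8')).hexdigest()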
import csv
import os
import time

import pandas as pd
import sklearn
from easydict import EasyDict as edict
from hdfs import Client
from sklearn import metrics
from sklearn.feature_selection import SelectKBest, chi2

# HDFS_HOSTS1 (the WebHDFS address) and RandomForestClassifier are defined elsewhere in
# the project; judging by its parameters (min_split_gain, colsample_bytree, subsample),
# RandomForestClassifier appears to be a custom implementation rather than sklearn's.


def interface(train_path, test_path, output_path, target, chaid_ratio,
              train_split_ratio=0.3, n_estimators=100, max_depth=5,
              min_samples_split=3, min_samples_leaf=2, min_split_gain=0.0,
              colsample_bytree="log2", subsample=0.8, random_state=100):
    # Record the start time
    time.localtime()
    time_trains_start = time.strftime('%Y{y}%m{m}%d{d} %H{h}%M{f}%S{s}'.format(
        y='/', m='/', d='', h=':', f=':', s=''))
    start_time = time.time()

    # Input file paths
    train_FILENAME = train_path + "/data/Data.csv"  # HDFS file path
    test_FILENAME = test_path + "/data/Data.csv"    # HDFS file path
    client = Client(HDFS_HOSTS1)

    # Read the training data
    with client.read(train_FILENAME) as tr_s:
        tr_content = tr_s.read()
        tr_s = str(tr_content, 'utf-8')
    # Make sure the local copy is fully written (write first, then flush/fsync)
    tr_file = open("trainData.csv", "w")
    tr_file.write(tr_s)
    tr_file.flush()
    os.fsync(tr_file.fileno())
    tr_file.close()
    df_train = pd.read_csv("trainData.csv", header=0)
    print(df_train)

    # Read the test data
    with client.read(test_FILENAME) as te_fs:
        te_content = te_fs.read()
        te_s = str(te_content, 'utf-8')
    # Make sure the local copy is fully written
    te_file = open("testData.csv", "w")
    te_file.write(te_s)
    te_file.flush()
    os.fsync(te_file.fileno())
    te_file.close()
    df_test = pd.read_csv("testData.csv", header=0)
    print(df_test)

    test_data_num = df_test.shape[0]   # was df_train.shape[0] in the original
    train_data_num = df_train.shape[0]

    # Chi-square test: keep the top chaid_ratio (default 80%) of columns most correlated
    # with the label column
    ch2 = SelectKBest(chi2, k=int(df_train.shape[1] * chaid_ratio))
    chi_df_train = pd.DataFrame(ch2.fit_transform(df_train, df_train[target]))
    label_df = df_train[target]

    # Custom random forest (used on the wine and sonar datasets)
    clf = RandomForestClassifier(n_estimators, max_depth, min_samples_split,
                                 min_samples_leaf, min_split_gain,
                                 colsample_bytree, subsample, random_state)

    # Split the data and train (.ix was removed from pandas; .iloc is the positional equivalent)
    train_count = int(train_split_ratio * len(chi_df_train))
    clf.fit(chi_df_train.iloc[:train_count], label_df.iloc[:train_count])
    train_acc = metrics.accuracy_score(
        label_df.iloc[:train_count], clf.predict(chi_df_train.iloc[:train_count]))
    print("Model accuracy:", train_acc)
    # Precision
    train_precision_score = metrics.precision_score(
        label_df.iloc[:train_count], clf.predict(chi_df_train.iloc[:train_count]))
    # Recall
    train_recall_score = metrics.recall_score(
        label_df.iloc[:train_count], clf.predict(chi_df_train.iloc[:train_count]))
    # F1 score
    train_f1_score = metrics.f1_score(
        label_df.iloc[:train_count], clf.predict(chi_df_train.iloc[:train_count]))
    # ROC AUC
    train_roc_auc_score1 = metrics.roc_auc_score(
        label_df.iloc[:train_count], clf.predict(chi_df_train.iloc[:train_count]))

    # Process the test set so its columns match the training set after chi-square selection
    ch2_list = list(ch2.get_support())
    ch2_list.pop()
    df_test_head = list(df_test.columns)
    # iterate over a copy so removing items does not skip columns
    for x, y in zip(ch2_list, list(df_test_head)):
        if not x:
            df_test_head.remove(y)
    df_test = df_test[df_test_head]

    # Predict
    result = clf.predict(df_test)
    # print(result)

    # Record the end time and compute the total duration
    train_end = time.time()
    train_seconds = train_end - start_time
    m, s = divmod(train_seconds, 60)
    h, m = divmod(m, 60)
    time_trains_all = "%02d:%02d:%02d" % (h, m, s)
    # print(time_trains_start, time_trains_all)

    # ---------------------------- Save the training results ----------------------------
    ## Save the model summary report
    abstract_path = output_path + '/abstract/data/'
    f = open('abstract.csv', mode='w', newline='')
    fileheader = [
        'FrameWork', 'Version', 'model', 'accuracy', 'time_trains_start',
        'time_trains_all', 'test_data_num', 'train_data_num'
    ]
    w = csv.DictWriter(f, fileheader)
    w.writeheader()
    csv_dict = edict()
    csv_dict.FrameWork = 'Scikit-learn'
    csv_dict.Version = sklearn.__version__
    csv_dict.model = '%s' % RandomForestClassifier
    csv_dict.accuracy = str(train_acc)
    csv_dict.time_trains_start = time_trains_start
    csv_dict.time_trains_all = time_trains_all
    csv_dict.test_data_num = str(test_data_num)
    csv_dict.train_data_num = str(train_data_num)
    w.writerow(csv_dict)
    f.close()
    client.delete(abstract_path + 'abstract.csv')
    client.upload(abstract_path + 'abstract.csv', 'abstract.csv')

    ## Save the model version info csv
    version_path = output_path + '/msg/data/'
    f = open('msg.csv', mode='w', newline='')
    fileheader = [
        'accuracy', 'time_trains_start', 'time_trains_all', 'test_data_num',
        'train_data_num'
    ]
    w = csv.DictWriter(f, fileheader)
    w.writeheader()
    csv_dict = edict()
    csv_dict.accuracy = str(train_acc)
    csv_dict.time_trains_start = time_trains_start
    csv_dict.time_trains_all = time_trains_all
    csv_dict.test_data_num = str(test_data_num)
    csv_dict.train_data_num = str(train_data_num)
    w.writerow(csv_dict)
    f.close()
    client.delete(version_path + 'msg.csv')
    client.upload(version_path + 'msg.csv', 'msg.csv')

    ## Save the training evaluation metrics report
    file_csv_path = output_path + '/evaluation/data/'
    f = open('evaluation.csv', mode='w', newline='')
    fileheader = [
        'accuracy', 'train_precision_score', 'train_recall_score',
        'train_f1_score', 'train_roc_auc_score1'
    ]
    w = csv.DictWriter(f, fileheader)
    w.writeheader()
    csv_dict = edict()
    csv_dict.accuracy = str(train_acc)
    csv_dict.train_precision_score = train_precision_score
    csv_dict.train_recall_score = train_recall_score
    csv_dict.train_f1_score = train_f1_score
    csv_dict.train_roc_auc_score1 = train_roc_auc_score1
    w.writerow(csv_dict)
    f.close()
    client.delete(file_csv_path + 'evaluation.csv')
    client.upload(file_csv_path + 'evaluation.csv', 'evaluation.csv')

    # Save the test-set prediction results
    file_csv_path = output_path + '/result/data/'
    dataframe = pd.DataFrame({target: result})
    dataframe.to_csv("result.csv", index=False, sep=',')
    client.delete(file_csv_path + 'result.csv')
    client.upload(file_csv_path + 'result.csv', 'result.csv')
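# --- Hedged sketch (not in the original): a more direct way to align the test-set
# columns with the chi-square selection inside interface(), using SelectKBest's boolean
# support mask instead of removing column names one by one. Assumes the target is
# df_train's last column and is absent from df_test.
def select_test_columns(ch2, df_test):
    support_mask = ch2.get_support()[:-1]   # drop the mask entry for the target column
    return df_test.loc[:, support_mask]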
def post(self, request, *args, **kwargs):
    import uuid
    permission_classes = (IsAuthenticated,)  # normally declared as a class attribute on the view
    start_time = time.time()
    file_serializer = DatasourceSerializer(data=request.data)
    if file_serializer.is_valid():
        path = file_serializer.validated_data['file_name']
        user = request.user.id
        # Size of the uploaded file (MB)
        filesize = round((path.size) / 1024 / 1024, 2)
        # Total size of all of this user's existing files
        mydata_id = DataSource.objects.filter(user_id=user)
        myData_size = 0
        for i in mydata_id:
            try:
                x = i.fileSize.replace("KB", '')
                myData_size += float(x)
            except:
                continue
        myData_size = round(myData_size / 1024, 2)  # in MB
        # Size the user would occupy after this upload
        now_userDataSize = filesize + myData_size
        # Look up the user's upload quota
        user_dataSize_old = UserLevel.objects.get(user_id=user).up_load
        print(type(user_dataSize_old))
        if now_userDataSize > user_dataSize_old:
            return Response({'msg': '您的数据容量不足,请清理数据后在尝试', 'status': False})
        # if 1 > 2:
        #     pass
        else:
            try:
                is_header = file_serializer.validated_data['is_header']
                # user = 1
                separator = file_serializer.validated_data['column_delimiter']
            except:
                # SQL dump files have no header row, so fall back to defaults
                is_header = ''
                separator = '\n'
            last = (str(path).lower()).split('.')[-1].upper()
            if last == 'CSV' or last == 'TXT' or last == 'SQL':
                if path.size > LIMIT_FILE_SIZE:
                    format_name = uuid.uuid1()
                    file_serializer.validated_data['format_filename'] = format_name
                    file_serializer.save()
                    client = Client(HDFS_HOST)
                    file_path = os.path.join(settings.MEDIA_ROOT, str(path))
                    with open(file_path, 'rb') as f1:
                        # Detect the file encoding
                        data_type = chardet.detect(f1.readline())['encoding']
                    if data_type is None:  # chardet returns None (not the string 'None') when detection fails
                        return Response({'msg': '数据格式有误', 'status': False})
                    os.renames(file_path, os.path.join(settings.MEDIA_ROOT, str(format_name)))
                    client.upload("/datahoop", os.path.join(settings.MEDIA_ROOT, str(format_name)), n_threads=4)
                    os.remove(os.path.join(settings.MEDIA_ROOT, str(format_name)))
                    try:
                        with client.read('/datahoop/' + str(format_name), encoding=data_type) as reader:
                            filesize = ((client.status('/datahoop/' + str(format_name)))['length']) / 1024
                            filesize = str(round(filesize, 2)) + 'KB'
                            reader = reader.readlines()
                    except:
                        return Response({'msg': '数据读取失败', 'status': False})
                    column_delimiter = separator
                    if is_header == 1:
                        title = (reader[0]).split(column_delimiter)
                        json = {}
                        s = ((reader[0]).split(column_delimiter))
                        for i in s:
                            json[i.replace('\r\n', '')] = [typ.StringType, True]
                        print(json)
                    else:
                        total = len((reader[0]).split(column_delimiter))
                        title = []
                        for i in range(total):
                            title.append('_C' + str(i))
                        json = {}
                        for i in title:
                            json[i] = [typ.StringType, True]
                    column_num = len((reader[0]).split(column_delimiter))
                    row_num = len(reader)
                    DataSource.objects.filter(format_filename=format_name).update(
                        user_id=user, title=title[:20], fileSize=filesize, where='hdfs',
                        row_num=row_num, column_num=column_num)
                    over_time = time.time()
                    print('ID为<%s>用户--数据上传<%s>文件的时间为--<%s>秒' % (user, format_name, over_time - start_time))
                    return Response({'msg': '数据存储成功', 'status': True})
                else:
                    global object_id
                    filePath = os.path.join(settings.MEDIA_ROOT, str(path))
                    file_serializer.save()
                    filesize = str(round((path.size) / 1024, 2)) + 'KB'
                    if last == 'XLS' or last == 'XLSX':
                        pass
                    elif last == 'TXT':
                        object_id = tools.save_mongo_txt(filePath, user, is_header, separator, str(path))
                        if object_id != 'none':
                            file_serializer.validated_data['obj_id'] = object_id
                            file_serializer.validated_data['file_name'] = str(path)
                            file_serializer.save()
                        else:
                            DataSource.objects.filter(file_name=str(path), user=1).delete()
                            os.remove(os.path.join(settings.MEDIA_ROOT, str(path)))
                            return Response({'msg': '数据格式有误', 'status': False})
                    elif last == 'CSV':
                        object_id = tools.save_mongo_csv(filePath, user, is_header, separator, str(path))
                        if object_id != 'none':
                            file_serializer.validated_data['obj_id'] = object_id
                            file_serializer.validated_data['file_name'] = str(path)
                            file_serializer.save()
                        else:
                            # uuid = uuid.uuid1()
                            # file_serializer.validated_data['obj_id'] = uuid
                            # file_serializer.validated_data['file_name'] = str(path)
                            # file_serializer.save()
                            DataSource.objects.filter(file_name=str(path), user=1).delete()
                            os.remove(os.path.join(settings.MEDIA_ROOT, str(path)))
                            return Response({'msg': '数据格式有误', 'status': False})
                    elif last == 'SQL':
                        try:
                            object_id = tools.save_mongo_sql(filePath, user)
                            file_serializer.validated_data['obj_id'] = object_id
                            file_serializer.validated_data['file_name'] = str(path)
                            file_serializer.save()
                        except Exception as e:
                            DataSource.objects.filter(file_name=str(path), user=1).delete()
                            os.remove(os.path.join(settings.MEDIA_ROOT, str(path)))
                            return Response({'msg': '数据格式有误', 'status': False})
                    with open(filePath, 'rb') as f1:
                        # Detect the file encoding
                        data_type = chardet.detect(f1.readline())['encoding']
                    with open(filePath, encoding=data_type, errors='ignore') as reader:
                        # Read the file using the detected encoding
                        reader = reader.readlines()
                    if is_header == 1:
                        title = (reader[0]).split(separator)
                        json = {}
                        s = ((reader[0]).split(separator))
                        for i in s:
                            json[i.replace('\r\n', '')] = [typ.StringType, True]
                        column_num = len((reader[0]).split(separator))
                    else:
                        if last != 'SQL':
                            total = len((reader[0]).split(separator))
                            title = []
                            for i in range(total):
                                title.append('_C' + str(i))
                            json = {}
                            for i in title:
                                json[i] = [typ.StringType, True]
                            column_num = len((reader[0]).split(separator))
                        else:
                            total = re.findall(r'[^()]+', reader[0])[1].split(',')
                            title = []
                            for i in range(len(total)):
                                title.append('_C' + str(i))
                            json = {}
                            for i in title:
                                json[i] = [typ.StringType, True]
                            column_num = len(total)
                    row_num = len(reader)
                    DataSource.objects.filter(obj_id=object_id).update(
                        user_id=user, title=title[:20], fileSize=filesize, where='mongodb',
                        row_num=row_num, column_num=column_num)
                    os.remove(os.path.join(settings.MEDIA_ROOT, str(path)))
                    over_time = time.time()
                    print('ID为<%s>用户--数据上传<%s>文件的时间为--<%s>秒' % (user, path, over_time - start_time))
                    return Response({'msg': '数据存储成功', 'status': True})
            else:
                return Response({'msg': '暂不支持此类文件上传', 'status': False})
    else:
        return Response({'msg': '不是一个有效的数据', 'status': False})
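# --- Hedged sketch (not in the original): the view above detects the upload's encoding
# from its first line with chardet. A small self-contained helper illustrating that
# pattern, with a fallback when detection fails (helper name and fallback are assumptions).
import chardet

def detect_encoding(local_path, fallback='utf-8'):
    with open(local_path, 'rb') as fh:
        guess = chardet.detect(fh.readline())['encoding']
    return guess or fallback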
def get(self, request, detail_id):
    id = detail_id
    view_num = DataSource.objects.get(id=id).view_num
    DataSource.objects.filter(id=id).update(view_num=view_num + 1)
    user_id = request.user.id
    object = DataSource.objects.get(id=id)
    where = object.obj_id
    if where == '':
        try:
            file = object.format_filename
            file = str(file)
            header = 'media/' + str(object.user.image)
            username = object.user.username
            hdfs_path = '/datahoop/' + file
            client = Client(HDFS_HOST)
            json = {}
            json['header'] = header
            file_user_id = object.user.id
            relation = Relationship.objects.filter(author_id=user_id)
            all = []
            for i in relation:
                all.append(i.User_ByID)
            if str(file_user_id) in all:
                json['is_focus'] = 1
            else:
                json['is_focus'] = 0
            objects = Collect.objects.filter(source=1, user=user_id, file_id=id)
            if objects:
                json['is_collect'] = 1
            else:
                json['is_collect'] = 0
            love = Love.objects.filter(user=user_id, file_id=id, source=1)
            if love:
                json['is_love'] = 1
            else:
                json['is_love'] = 0
            file_user_id = object.user_id
            if user_id == file_user_id:
                json['is_me'] = 1
            else:
                json['is_me'] = 0
            json['file_name'] = str(object.file_name)
            json['title'] = object.detail
            json['hdfs'] = object.format_filename
            json['fav_num'] = object.fav_num
            json['view_num'] = object.view_num
            json['thumb_num'] = object.thumb_num
            json['label'] = [object.label_name, '']
            json['username'] = username
            json['data'] = []
            with client.read(hdfs_path, encoding='utf-8') as reader:
                for i in (reader.readlines())[0:20]:
                    print(i)
                    json['data'].append(i.split(','))
            return Response(json, status=status.HTTP_200_OK)
        except Exception as e:
            print(e)
            return Response(status=status.HTTP_404_NOT_FOUND)
    else:
        try:
            client = pymongo.MongoClient(MONGO_DB_URI)
            db = client.datahoop.data
            json = {}
            fileName = str(object.file_name)
            obj_id = object.obj_id
            file_user_id = object.user.id
            relation = Relationship.objects.filter(author_id=user_id)
            all = []
            for i in relation:
                all.append(i.User_ByID)
            if str(file_user_id) in all:
                json['is_focus'] = 1
            else:
                json['is_focus'] = 0
            result = db.find({'_id': ObjectId(obj_id)})
            fileType = (fileName.split('.')[-1]).lower()  # get the file extension
            table_name = fileName
            sheetList = ''
            sheet = ''
            json['file_user_id'] = file_user_id
            json['header'] = 'media/' + str(object.user.image)
            json['file_name'] = str(object.file_name)
            if user_id == file_user_id:
                json['is_me'] = 1
            else:
                json['is_me'] = 0
            json['title'] = object.detail
            objects = Collect.objects.filter(source=1, user=user_id, file_id=id)
            if objects:
                json['is_collect'] = 1
            else:
                json['is_collect'] = 0
            love = Love.objects.filter(user=user_id, file_id=id, source=1)
            if love:
                json['is_love'] = 1
            else:
                json['is_love'] = 0
            json['obj_id'] = object.obj_id
            json['fav_num'] = object.fav_num
            json['view_num'] = object.view_num
            json['thumb_num'] = object.thumb_num
            json['label'] = [object.label_name, '']
            json['username'] = object.user.username
            if fileType == 'xls' or fileType == 'xlsx':
                # Read the excel file; returns a list of sheets
                sheetList = (sorted((result[0]['fileData'])))
                rel = sorted((result[0]['fileData']).items())[0][1][0:501]
                default_sheet = sorted((result[0]['fileData']).items())[0][0]
                sheet = request.GET.get('sheet')
                if sheet:
                    rel = (result[0]['fileData'][sheet][0:501])
                else:
                    sheet = default_sheet
            elif fileType == 'csv' or fileType == 'txt':
                import pandas as pd
                # empty = pandas.DataFrame()
                # data = empty.append(result[0]['fileData'])
                # rel = data[0:].values.tolist()[0:20]
                data = pd.DataFrame(result[0]['fileData'])
                rel = data.values.tolist()[0:200]
                for i in range(len(rel)):
                    for j in range(len(rel[i])):
                        if str(rel[i][j]) == "nan":
                            rel[i][j] = ""
                json['data'] = rel
            elif fileType == 'sql':
                rel = (result[0]['fileData'])[0:20]
                json['data'] = rel
            client.close()
            return Response(json, status=status.HTTP_200_OK)
        except Exception as e:
            print(e)
            return Response(status=status.HTTP_404_NOT_FOUND)
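# --- Hedged sketch (not in the original): the HDFS branch above previews a file by
# calling reader.readlines() and slicing the first 20 lines, which pulls the whole file
# into memory. A small alternative that only reads what the preview needs (helper name
# and line count are assumptions).
from itertools import islice

def preview_hdfs_file(client, hdfs_path, n_lines=20):
    with client.read(hdfs_path, encoding='utf-8', delimiter='\n') as reader:
        return [line.split(',') for line in islice(reader, n_lines)]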
from hdfs import Client

# Setup (assumed values; the original snippet defines these elsewhere)
client = Client("http://namenode:50070")
file_dir = "/tmp/hdfs_demo"
file_name = file_dir + "/demo.txt"
file_name2 = file_dir + "/demo_renamed.txt"
local_file_name = "demo_local.txt"

# Create a directory
client.makedirs(file_dir)

# Return status info for the target (None instead of an error if it is missing, because strict=False)
info = client.status(file_name, strict=False)
print(info)

# Write to a file (overwrite)
client.write(file_name, data="hello hdfs !", overwrite=True)
# Write to a file (append)
client.write(file_name, data="hello way !", overwrite=False, append=True)

# Read the file contents
with client.read(file_name, encoding='utf-8') as f:
    print(f.read())

# Download a file
client.download(file_name, local_file_name, overwrite=True)
# Upload a file
client.upload(file_name + '111', local_file_name, cleanup=True)

# Delete a file
client.delete(file_name2)
# Rename a file
client.rename(file_name, file_name2)

# List the files under a directory
print(client.list(file_dir))
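# --- Hedged extra example (not in the original): listing with metadata and walking a
# directory tree, using calls from the same hdfs library (paths reuse the demo values above).
# list(..., status=True) returns (name, status_dict) pairs; walk() recurses like os.walk.
for name, meta in client.list(file_dir, status=True):
    print(name, meta['type'], meta['length'])

for root, dirs, files in client.walk(file_dir, depth=2):
    print(root, dirs, files)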