Example 1
from hdfs import Client, InsecureClient


def get_data():
    # Note: the second positional argument of hdfs.Client is the root path, not the
    # user name; to pass a user, use InsecureClient (see the commented line below).
    client = Client("http://t3.dev:50070", "hadoop")
    # client = InsecureClient(url="http://t3.dev:50070", user="******", root="/")
    print(client.list("/huiqu/common/area.txt"))
    with client.read("/huiqu/common/area.txt/part-00000") as read:
        # print(read.read().decode('utf8'))
        return {"data": read.read().decode('utf8')}
Example 2
            print(rowkey)
            mutations = [
                Mutation(column=self.columnFamily + ":ipaddr", value=ipaddr),
                Mutation(column=self.columnFamily + ":visitTime", value=visitTime),
                Mutation(column=self.columnFamily + ":user_id", value=user_id),
                Mutation(column=self.columnFamily + ":link", value=link),
            ]
            # Submit multiple rows in one batch
            mutations_batch.append(
                BatchMutation(row=rowkey, mutations=mutations))
            if len(mutations_batch) % batch_size == 0:
                self.client.mutateRows(self.tablename, mutations_batch)
                mutations_batch = []


if __name__ == "__main__":

    # Establish the HBase connection
    hbase_writer = CreateTableAndImportData('user_log_info', 'cf_log')
    hbase_writer.createTable()

    # Connect to HDFS
    client = Client(HDFSNN)

    # Get the list of log files
    logFiles = client.list(LOGPATH)

    # Read each file and import it
    for logfile in logFiles:
        with client.read(os.path.join(LOGPATH, logfile)) as deal_file_handle:
            hbase_writer.importData(deal_file_handle)
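Note that the batch in the import method above is flushed only when its length reaches a multiple of batch_size, so rows left over at the end of the loop are never written. A minimal sketch of the trailing flush (attribute names reused from the example), to run once the loop finishes:

            # flush whatever remains in the final partial batch
            if mutations_batch:
                self.client.mutateRows(self.tablename, mutations_batch)
                mutations_batch = []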
Example 3
import os

import jieba
from hdfs import Client


class ChatBotModel(object):
    def __init__(self,
                 hadoop_url,
                 hdfs_index_file,
                 local_index_file,
                 corpus_dir,
                 unk_answer='',
                 max_answer_len=1024):
        self.hadoop_url = hadoop_url
        self.hdfs_index_file = hdfs_index_file
        self.local_index_file = local_index_file
        self.corpus_dir = corpus_dir
        self.max_answer_len = max_answer_len
        self.unk_answer = unk_answer
        self.client = None
        self.inverted_index = {}

    def build_connection(self):
        self.client = Client(self.hadoop_url)

    def fetch_index_file(self):
        self.client.download(hdfs_path=self.hdfs_index_file,
                             local_path=self.local_index_file,
                             overwrite=True)

    def load_inverted_index(self):
        with open(self.local_index_file, 'r', encoding='utf-8') as f:
            for line in f:
                word, *querys = line.strip().split('\t')
                for query in querys:
                    file_name, query_id, score = query.split(':')
                    # setdefault collapses the duplicated if/else append branches
                    self.inverted_index.setdefault(word, []).append(
                        [file_name, int(query_id), float(score)])

    def prepare(self):
        self.build_connection()
        self.fetch_index_file()
        self.load_inverted_index()

    def read_corpus_answer(self, file_name, query_id):
        file_path = os.path.join(self.corpus_dir, file_name)
        file_status = self.client.status(file_path)
        if file_status['length'] <= query_id:
            return None
        with self.client.read(hdfs_path=file_path,
                              offset=query_id,
                              length=self.max_answer_len,
                              encoding='utf-8') as f:
            answer = f.read().strip().split('\n')[0]
            return answer

    def predict_answer(self, query):
        words = jieba.lcut_for_search(query)
        querys = {}
        for word in words:
            if word not in self.inverted_index:
                continue
            for file_name, query_id, score in self.inverted_index[word]:
                key = (file_name, query_id)  # avoid shadowing the query argument
                if key in querys:
                    querys[key] += score
                else:
                    querys[key] = score
        if len(querys) == 0:
            return self.unk_answer
        best_query = max(querys.items(), key=lambda x: x[1])
        (best_file_name, best_query_id), best_score = best_query
        best_answer = self.read_corpus_answer(best_file_name, best_query_id)
        if best_answer is None:
            best_answer = self.unk_answer
        return best_answer
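A minimal usage sketch of the class above; the NameNode URL and HDFS paths are placeholders:

model = ChatBotModel(
    hadoop_url="http://namenode:50070",         # placeholder WebHDFS endpoint
    hdfs_index_file="/chatbot/inverted.index",  # placeholder index path
    local_index_file="inverted.index",
    corpus_dir="/chatbot/corpus",
    unk_answer="Sorry, I don't know.")
model.prepare()  # connect, download the index file, load it into memory
print(model.predict_answer("你好"))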
Example 4
import pandas as pd
import os
from hdfs import Client
# Current approach for reading an HDFS file:
# 1. read the binary stream from HDFS
# 2. save the binary data as a .csv file
# 3. read the csv file with pandas
HDFSHOST = "http://172.16.18.112:50070"
train_path = "/9a7e7ef5a78f4f8388deff28cc5c2115/dataSet/devdatasetdef19761"
test_path = "/9a7e7ef5a78f4f8388deff28cc5c2115/dataSet/devdatasetdef19762"
train_FILENAME = train_path + "/data/Data.csv"  # HDFS file path
test_FILENAME = test_path + "/data/Data.csv"  # HDFS file path
client = Client(HDFSHOST)
with client.read(train_FILENAME) as tr_s:
    tr_content = tr_s.read()
    tr_s = str(tr_content, 'utf-8')

# Write the data out and make sure it is flushed to disk
# (flush/fsync must come after the write, not before)
tr_file = open("trainData.csv", "w")
tr_file.write(tr_s)
tr_file.flush()
os.fsync(tr_file.fileno())
tr_file.close()

# Read the csv back with pandas
df_train = pd.read_csv("trainData.csv", header=0)
print(df_train)

with client.read(test_FILENAME) as te_fs:
    te_content = te_fs.read()
    te_s = str(te_content, 'utf-8')
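The temporary .csv detour can be avoided entirely: pandas can parse the downloaded bytes in memory. A minimal sketch under the same HDFSHOST and train_FILENAME (the io.StringIO buffering is the only addition):

import io

with client.read(train_FILENAME) as reader:
    # parse the HDFS byte stream directly instead of writing a temp file
    df_train = pd.read_csv(io.StringIO(reader.read().decode('utf-8')), header=0)
print(df_train)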
Example 5
def train(train_path,
          test_path,
          output_path,
          target,
          train_split_ratio=0.33,
          penalty='l2',
          dual=False,
          tol=1e-4,
          C=1.0,
          random_state=None,
          multi_class='ovr'):
    # Record the start time (the original format() gymnastics reduce to this string;
    # the bare time.localtime() call was a no-op)
    time_trains_start = time.strftime('%Y/%m/%d %H:%M:%S')
    start_time = time.time()

    # Input file paths on HDFS
    train_FILENAME = train_path + "/data/Data.csv"  # HDFS file path
    test_FILENAME = test_path + "/data/Data.csv"  # HDFS file path
    client = Client(HDFS_HOSTS1)
    # Read the training data
    with client.read(train_FILENAME) as tr_s:
        tr_content = tr_s.read()
        tr_s = str(tr_content, 'utf-8')
    # Write the data out and make sure it is flushed to disk
    tr_file = open("trainData.csv", "w")
    tr_file.write(tr_s)
    tr_file.flush()
    os.fsync(tr_file.fileno())
    tr_file.close()
    df_train = pd.read_csv("trainData.csv", header=0)
    print(df_train)

    # Read the test data
    with client.read(test_FILENAME) as te_fs:
        te_content = te_fs.read()
        te_s = str(te_content, 'utf-8')
    # Write the data out and make sure it is flushed to disk
    te_file = open("testData.csv", "w")
    te_file.write(te_s)
    te_file.flush()
    os.fsync(te_file.fileno())
    te_file.close()
    df_test = pd.read_csv("testData.csv", header=0)
    print(df_test)

    min_max_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
    test_data_num = df_test.shape[0]  # was df_train.shape[0], a copy-paste slip
    train_data_num = df_train.shape[0]

    # Scale the prediction (test) set
    # (note: fit_transform here fits the scaler on the test data itself,
    # separately from the training features below)
    df_test = min_max_scaler.fit_transform(df_test)
    df_test = np.array(df_test)

    # Data preparation: feature columns exclude the target
    cols = [tmp_i for tmp_i in df_train.columns if tmp_i not in [target]]
    X = df_train[cols]

    X = np.array(X)
    X = min_max_scaler.fit_transform(X)
    Y = df_train[target]
    Y = np.array(Y)

    # Split the training data
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=train_split_ratio)

    # Train a logistic regression model with scikit-learn
    # (keyword arguments: passed positionally, random_state and multi_class
    # would bind to fit_intercept and intercept_scaling)
    clf = LogisticRegression(penalty=penalty,
                             dual=dual,
                             tol=tol,
                             C=C,
                             random_state=random_state,
                             multi_class=multi_class,
                             solver='liblinear')
    clf.fit(X_train, Y_train)

    # Accuracy (train_acc)
    train_acc = clf.score(X_test, Y_test)
    print('score Scikit learn: ', train_acc)
    # Precision (train_precision_score)
    train_precision_score = precision_score(Y_test, clf.predict(X_test))
    # Recall (train_recall_score)
    train_recall_score = recall_score(Y_test, clf.predict(X_test))
    # F1_Score
    train_f1_score = f1_score(Y_test, clf.predict(X_test))
    # roc_auc_score
    train_roc_auc_score1 = roc_auc_score(Y_test, clf.predict(X_test))

    # Predict with the trained scikit-learn LR model
    result = clf.predict(df_test)
    # print(result)

    # Record the end time and compute the total runtime
    train_end = time.time()
    train_seconds = train_end - start_time
    m, s = divmod(train_seconds, 60)
    h, m = divmod(m, 60)
    time_trains_all = "%02d:%02d:%02d" % (h, m, s)

    # ++++++++++++++++++++++++++++++++++++++++ Save training results +++++++++++++++++++++++++++++++++++++++#
    ## Save the model summary report file
    # abstract_path = HDFS_HOSTS1 + output_path + '/abstract/data/'
    abstract_path = output_path + '/abstract/data/'
    f = open('abstract.csv', mode='w', newline='')
    fileheader = [
        'FrameWork', 'Version', 'model', 'accuracy', 'time_trains_start',
        'time_trains_all', 'test_data_num', 'train_data_num'
    ]
    w = csv.DictWriter(f, fileheader)
    w.writeheader()
    csv_dict = edict()
    csv_dict.FrameWork = 'Scikit-learn'
    csv_dict.Version = sklearn.__version__
    csv_dict.model = '%s' % LogisticRegression
    csv_dict.accuracy = str(train_acc)
    csv_dict.time_trains_start = time_trains_start
    csv_dict.time_trains_all = time_trains_all
    csv_dict.test_data_num = str(test_data_num)
    csv_dict.train_data_num = str(train_data_num)
    w.writerow(csv_dict)
    f.close()
    client.delete(abstract_path + 'abstract.csv')
    client.upload(abstract_path + 'abstract.csv', 'abstract.csv')
    # if len(client.list(abstract_path)):
    # 	client.delete(abstract_path + 'abstract.csv')
    # 	client.upload(abstract_path + 'abstract.csv', 'abstract.csv')
    # else:
    # 	client.upload(abstract_path + 'abstract.csv', 'abstract.csv')

    ## Save the model version info csv file
    version_path = output_path + '/msg/data/'
    f = open('msg.csv', mode='w', newline='')
    fileheader = [
        'accuracy', 'time_trains_start', 'time_trains_all', 'test_data_num',
        'train_data_num'
    ]
    w = csv.DictWriter(f, fileheader)
    w.writeheader()
    csv_dict = edict()
    csv_dict.accuracy = str(train_acc)
    csv_dict.time_trains_start = time_trains_start
    csv_dict.time_trains_all = time_trains_all
    csv_dict.test_data_num = str(test_data_num)
    csv_dict.train_data_num = str(train_data_num)
    w.writerow(csv_dict)
    f.close()
    client.delete(version_path + 'msg.csv')
    client.upload(version_path + 'msg.csv', 'msg.csv')

    ## Save the training evaluation metrics report file
    file_csv_path = output_path + '/evaluation/data/'
    f = open('evaluation.csv', mode='w', newline='')
    fileheader = [
        'accuracy', 'train_precision_score', 'train_recall_score',
        'train_f1_score', 'train_roc_auc_score1'
    ]
    w = csv.DictWriter(f, fileheader)
    w.writeheader()
    csv_dict = edict()
    csv_dict.accuracy = str(train_acc)
    csv_dict.train_precision_score = train_precision_score
    csv_dict.train_recall_score = train_recall_score
    csv_dict.train_f1_score = train_f1_score
    csv_dict.train_roc_auc_score1 = train_roc_auc_score1
    w.writerow(csv_dict)
    f.close()
    client.delete(file_csv_path + 'evaluation.csv')
    client.upload(file_csv_path + 'evaluation.csv', 'evaluation.csv')

    # Save the test-set prediction results
    file_csv_path = output_path + '/result/data/'

    # The dict keys become the csv column names
    dataframe = pd.DataFrame({target: result})
    # Write the DataFrame to csv; index=False omits the row labels
    dataframe.to_csv("result.csv", index=False, sep=',')

    client.delete(file_csv_path + 'result.csv')
    client.upload(file_csv_path + 'result.csv', 'result.csv')
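The delete-then-upload pair is repeated for every report file; a minimal helper sketch that consolidates the pattern (the helper name is an assumption, client and the *_path variables come from the example):

def replace_on_hdfs(client, hdfs_dir, local_name):
    # remove any previous copy, then upload the fresh local file
    client.delete(hdfs_dir + local_name)
    client.upload(hdfs_dir + local_name, local_name)

# e.g. replace_on_hdfs(client, abstract_path, 'abstract.csv')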
Example 6
        rowkey = md5(str(user_id) + str(visitTime))

        print(rowkey)

        mutations = [
            Mutation(column=self.columnFamily + ":ipaddr", value=ipaddr),
            Mutation(column=self.columnFamily + ":visitTime", value=visitTime),
            Mutation(column=self.columnFamily + ":user_id", value=user_id),
            Mutation(column=self.columnFamily + ":link", value=link),
        ]
        self.client.mutateRow(self.tablename, rowkey, mutations)


if __name__ == "__main__":

    # Establish the HBase connection
    hbase_writer = CreateTableAndImportData('user_log_info', 'cf_log')
    hbase_writer.createTable()

    # Connect to HDFS
    client = Client(HDFSNN, timeout=200000)

    # Get the list of log files
    logFiles = client.list(LOGPATH)

    # Read each file and import it line by line
    for logfile in logFiles:
        with client.read(LOGPATH + logfile) as fp:
            for line in fp:
                record = line.split(" ")
                hbase_writer.importData(record)
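The md5 helper used to build the rowkey above is not shown; a minimal sketch of what it presumably does (an assumption inferred from the call site):

import hashlib


def md5(text):
    # hash the concatenated user_id and visitTime into a fixed-length rowkey
    return hashlib.md5(text.encode('utf-8')).hexdigest()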
def interface(train_path,
              test_path,
              output_path,
              target,
              chaid_ratio,
              train_split_ratio=0.3,
              n_estimators=100,
              max_depth=5,
              min_samples_split=3,
              min_samples_leaf=2,
              min_split_gain=0.0,
              colsample_bytree="log2",
              subsample=0.8,
              random_state=100):
    # Record the start time (the format() gymnastics reduce to this string;
    # the bare time.localtime() call was a no-op)
    time_trains_start = time.strftime('%Y/%m/%d %H:%M:%S')
    start_time = time.time()

    # Input file paths on HDFS
    train_FILENAME = train_path + "/data/Data.csv"  # HDFS file path
    test_FILENAME = test_path + "/data/Data.csv"  # HDFS file path
    client = Client(HDFS_HOSTS1)
    # Read the training data
    with client.read(train_FILENAME) as tr_s:
        tr_content = tr_s.read()
        tr_s = str(tr_content, 'utf-8')
    # Write the data out and make sure it is flushed to disk
    tr_file = open("trainData.csv", "w")
    tr_file.write(tr_s)
    tr_file.flush()
    os.fsync(tr_file.fileno())
    tr_file.close()
    df_train = pd.read_csv("trainData.csv", header=0)
    print(df_train)

    # Read the test data
    with client.read(test_FILENAME) as te_fs:
        te_content = te_fs.read()
        te_s = str(te_content, 'utf-8')
    # Write the data out and make sure it is flushed to disk
    te_file = open("testData.csv", "w")
    te_file.write(te_s)
    te_file.flush()
    os.fsync(te_file.fileno())
    te_file.close()
    df_test = pd.read_csv("testData.csv", header=0)
    print(df_test)

    test_data_num = df_test.shape[0]  # was df_train.shape[0], a copy-paste slip
    train_data_num = df_train.shape[0]

    # Chi-square test: keep the top chaid_ratio (default 80%) of columns most related to the label
    ch2 = SelectKBest(chi2, k=int(df_train.shape[1] * chaid_ratio))
    chi_df_train = pd.DataFrame(ch2.fit_transform(df_train, df_train[target]))

    label_df = df_train[target]
    # Works with e.g. the wine and sonar data sets.
    # Note: min_split_gain, colsample_bytree and subsample are not scikit-learn
    # RandomForestClassifier parameters, so this appears to be a custom
    # implementation; keyword arguments make the mapping explicit.
    clf = RandomForestClassifier(n_estimators=n_estimators,
                                 max_depth=max_depth,
                                 min_samples_split=min_samples_split,
                                 min_samples_leaf=min_samples_leaf,
                                 min_split_gain=min_split_gain,
                                 colsample_bytree=colsample_bytree,
                                 subsample=subsample,
                                 random_state=random_state)

    # Split the data and train (.iloc replaces the long-removed .ix accessor)
    train_count = int(train_split_ratio * len(chi_df_train))
    clf.fit(chi_df_train.iloc[:train_count], label_df.iloc[:train_count])
    train_acc = metrics.accuracy_score(
        label_df.iloc[:train_count], clf.predict(chi_df_train.iloc[:train_count]))
    print("Model accuracy:", train_acc)
    # Precision
    train_precision_score = metrics.precision_score(
        label_df.iloc[:train_count], clf.predict(chi_df_train.iloc[:train_count]))
    # Recall
    train_recall_score = metrics.recall_score(
        label_df.iloc[:train_count], clf.predict(chi_df_train.iloc[:train_count]))
    # F1_Score
    train_f1_score = metrics.f1_score(
        label_df.iloc[:train_count], clf.predict(chi_df_train.iloc[:train_count]))
    # roc_auc_score
    train_roc_auc_score1 = metrics.roc_auc_score(
        label_df.iloc[:train_count], clf.predict(chi_df_train.iloc[:train_count]))

    # Align the test set so its columns match the training set after chi-square selection
    ch2_list = list(ch2.get_support())
    ch2_list.pop()  # drop the mask entry for the target column
    # Build the kept-column list without mutating the list being iterated
    # (the original remove-while-zipping loop skipped elements)
    df_test_head = [col for col, keep in zip(df_test.columns, ch2_list) if keep]
    df_test = df_test[df_test_head]
    # Predict
    result = clf.predict(df_test)
    # print(result)

    # Record the end time and compute the total runtime
    train_end = time.time()
    train_seconds = train_end - start_time
    m, s = divmod(train_seconds, 60)
    h, m = divmod(m, 60)
    time_trains_all = "%02d:%02d:%02d" % (h, m, s)
    # print(time_trains_start,time_trains_all)

    # ++++++++++++++++++++++++++++++++++++++++ Save training results +++++++++++++++++++++++++++++++++++++++#
    ## Save the model summary report file
    abstract_path = output_path + '/abstract/data/'
    f = open('abstract.csv', mode='w', newline='')
    fileheader = [
        'FrameWork', 'Version', 'model', 'accuracy', 'time_trains_start',
        'time_trains_all', 'test_data_num', 'train_data_num'
    ]
    w = csv.DictWriter(f, fileheader)
    w.writeheader()
    csv_dict = edict()
    csv_dict.FrameWork = 'Scikit-learn'
    csv_dict.Version = sklearn.__version__
    csv_dict.model = '%s' % RandomForestClassifier
    csv_dict.accuracy = str(train_acc)
    csv_dict.time_trains_start = time_trains_start
    csv_dict.time_trains_all = time_trains_all
    csv_dict.test_data_num = str(test_data_num)
    csv_dict.train_data_num = str(train_data_num)
    w.writerow(csv_dict)
    f.close()
    client.delete(abstract_path + 'abstract.csv')
    client.upload(abstract_path + 'abstract.csv', 'abstract.csv')

    ## Save the model version info csv file
    version_path = output_path + '/msg/data/'
    f = open('msg.csv', mode='w', newline='')
    fileheader = [
        'accuracy', 'time_trains_start', 'time_trains_all', 'test_data_num',
        'train_data_num'
    ]
    w = csv.DictWriter(f, fileheader)
    w.writeheader()
    csv_dict = edict()
    csv_dict.accuracy = str(train_acc)
    csv_dict.time_trains_start = time_trains_start
    csv_dict.time_trains_all = time_trains_all
    csv_dict.test_data_num = str(test_data_num)
    csv_dict.train_data_num = str(train_data_num)
    w.writerow(csv_dict)
    f.close()
    client.delete(version_path + 'msg.csv')
    client.upload(version_path + 'msg.csv', 'msg.csv')

    ## Save the training evaluation metrics report file
    file_csv_path = output_path + '/evaluation/data/'
    f = open('evaluation.csv', mode='w', newline='')
    fileheader = [
        'accuracy', 'train_precision_score', 'train_recall_score',
        'train_f1_score', 'train_roc_auc_score1'
    ]
    w = csv.DictWriter(f, fileheader)
    w.writeheader()
    csv_dict = edict()
    csv_dict.accuracy = str(train_acc)
    csv_dict.train_precision_score = train_precision_score
    csv_dict.train_recall_score = train_recall_score
    csv_dict.train_f1_score = train_f1_score
    csv_dict.train_roc_auc_score1 = train_roc_auc_score1
    w.writerow(csv_dict)
    f.close()
    client.delete(file_csv_path + 'evaluation.csv')
    client.upload(file_csv_path + 'evaluation.csv', 'evaluation.csv')

    # Save the test-set prediction results
    file_csv_path = output_path + '/result/data/'
    dataframe = pd.DataFrame({target: result})
    dataframe.to_csv("result.csv", index=False, sep=',')
    client.delete(file_csv_path + 'result.csv')
    client.upload(file_csv_path + 'result.csv', 'result.csv')
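To inspect which columns the chi-square step actually kept, the fitted selector's support mask can be zipped against the training columns; a minimal sketch using ch2 and df_train from the example:

kept = [col for col, keep in zip(df_train.columns, ch2.get_support()) if keep]
print("columns kept by SelectKBest:", kept)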
Example 8
    def post(self, request, *args, **kwargs):
        import uuid
        # Note: assigning permission_classes inside post() has no effect;
        # DRF only honours it as a class-level attribute.
        permission_classes = (IsAuthenticated,)
        start_time = time.time()
        file_serializer = DatasourceSerializer(data=request.data)

        if file_serializer.is_valid():
            path = file_serializer.validated_data['file_name']
            user = request.user.id
            # Size of the uploaded file in MB
            filesize = round((path.size) / 1024 / 1024, 2)

            #  Total size of all files this user already stores
            mydata_id = DataSource.objects.filter(user_id=user)
            myData_size = 0
            for i in mydata_id:
                try:
                    x = i.fileSize.replace("KB", '')
                    myData_size += float(x)
                except Exception:
                    continue
            myData_size = round(myData_size / 1024, 2)  # in MB

            #  Size the user would occupy after this upload
            now_userDataSize = filesize + myData_size

            #  Look up the user's upload quota
            user_dataSize_old = UserLevel.objects.get(user_id=user).up_load

            print(type(user_dataSize_old))
            if now_userDataSize > user_dataSize_old:
                return Response({'msg': 'Your data quota is insufficient; please free up space and try again', 'status': False})
            else:
                try:
                    is_header = file_serializer.validated_data['is_header']
                    # user = 1
                    separator = file_serializer.validated_data['column_delimiter']
                except Exception:
                    # SQL dump files have no header row, so fall back to defaults
                    is_header = ''
                    separator = '\n'
                last = (str(path).lower()).split('.')[-1].upper()
                if last in ('CSV', 'TXT', 'SQL'):
                    if path.size > LIMIT_FILE_SIZE:
                        format_name = uuid.uuid1()
                        file_serializer.validated_data['format_filename'] = format_name
                        file_serializer.save()
                        client = Client(HDFS_HOST)
                        file_path = os.path.join(settings.MEDIA_ROOT, str(path))
                        with open(file_path, 'rb') as f1:  # detect the file encoding
                            data_type = chardet.detect(f1.readline())['encoding']
                        if data_type is None:  # chardet returns None, not the string 'None'
                            return Response({'msg': 'Invalid data format', 'status': False})
                        os.renames(file_path, os.path.join(settings.MEDIA_ROOT, str(format_name)))
                        client.upload("/datahoop", os.path.join(settings.MEDIA_ROOT, str(format_name)), n_threads=4)
                        os.remove(os.path.join(settings.MEDIA_ROOT, str(format_name)))
                        try:
                            with client.read('/datahoop/' + str(format_name), encoding=data_type) as reader:
                                filesize = ((client.status('/datahoop/' + str(format_name)))['length']) / 1024
                                filesize = str(round(filesize, 2)) + 'KB'
                                reader = reader.readlines()
                        except Exception:
                            return Response({'msg': 'Failed to read the data', 'status': False})
                        column_delimiter = separator
                        if is_header == 1:
                            title = (reader[0]).split(column_delimiter)
                            json = {}
                            s = ((reader[0]).split(column_delimiter))
                            for i in s:
                                json[i.replace('\r\n', '')] = [typ.StringType, True]
                            print(json)
                        else:
                            total = len((reader[0]).split(column_delimiter))
                            title = []
                            for i in range(total):
                                title.append('_C' + str(i))
                            json = {}
                            for i in title:
                                json[i] = [typ.StringType, True]

                        column_num = len((reader[0]).split(column_delimiter))
                        row_num = len(reader)
                        DataSource.objects.filter(format_filename=format_name).update(user_id=user,
                                                                                      title=title[:20],
                                                                                      fileSize=filesize,
                                                                                      where='hdfs', row_num=row_num,
                                                                                      column_num=column_num)
                        over_time = time.time()
                        print('User <%s> uploaded file <%s> in <%s> seconds' % (user, format_name, over_time - start_time))
                        return Response({'msg': 'Data stored successfully', 'status': True})
                    else:
                        global object_id
                        filePath = os.path.join(settings.MEDIA_ROOT, str(path))
                        file_serializer.save()
                        filesize = str(round((path.size) / 1024, 2)) + 'KB'
                        if last in ('XLS', 'XLSX'):
                            pass
                        elif last == 'TXT':
                            object_id = tools.save_mongo_txt(filePath, user, is_header, separator, str(path))
                            if object_id != 'none':
                                file_serializer.validated_data['obj_id'] = object_id
                                file_serializer.validated_data['file_name'] = str(path)
                                file_serializer.save()
                            else:
                                DataSource.objects.filter(file_name=str(path), user=1).delete()
                                os.remove(os.path.join(settings.MEDIA_ROOT, str(path)))
                                return Response({'msg': 'Invalid data format', 'status': False})
                        elif last == 'CSV':
                            object_id = tools.save_mongo_csv(filePath, user, is_header, separator, str(path))
                            if object_id != 'none':
                                file_serializer.validated_data['obj_id'] = object_id
                                file_serializer.validated_data['file_name'] = str(path)
                                file_serializer.save()
                            else:
                                # uuid = uuid.uuid1()
                                # file_serializer.validated_data['obj_id'] = uuid
                                # file_serializer.validated_data['file_name'] = str(path)
                                # file_serializer.save()

                                DataSource.objects.filter(file_name=str(path), user=1).delete()
                                os.remove(os.path.join(settings.MEDIA_ROOT, str(path)))
                                return Response({'msg': 'Invalid data format', 'status': False})
                        elif last == 'SQL':
                            try:
                                object_id = tools.save_mongo_sql(filePath, user)
                                file_serializer.validated_data['obj_id'] = object_id
                                file_serializer.validated_data['file_name'] = str(path)
                                file_serializer.save()
                            except Exception as e:
                                DataSource.objects.filter(file_name=str(path), user=1).delete()
                                os.remove(os.path.join(settings.MEDIA_ROOT, str(path)))
                                return Response({'msg': 'Invalid data format', 'status': False})
                        with open(filePath, 'rb') as f1:  # detect the file encoding
                            data_type = chardet.detect(f1.readline())['encoding']
                        with open(filePath, encoding=data_type, errors='ignore') as reader:  # read with the detected encoding
                            reader = reader.readlines()
                        if is_header == 1:
                            title = (reader[0]).split(separator)
                            json = {}
                            s = ((reader[0]).split(separator))
                            for i in s:
                                json[i.replace('\r\n', '')] = [typ.StringType, True]
                            column_num = len((reader[0]).split(separator))
                        else:
                            if last != 'SQL':
                                total = len((reader[0]).split(separator))
                                title = []
                                for i in range(total):
                                    title.append('_C' + str(i))
                                json = {}
                                for i in title:
                                    json[i] = [typ.StringType, True]
                                column_num = len((reader[0]).split(separator))
                            else:
                                total = re.findall(r'[^()]+', reader[0])[1].split(',')
                                title = []
                                for i in range(len(total)):
                                    title.append('_C' + str(i))
                                json = {}
                                for i in title:
                                    json[i] = [typ.StringType, True]
                                column_num = len(total)
                        row_num = len(reader)
                        DataSource.objects.filter(obj_id=object_id).update(user_id=user, title=title[:20],
                                                                           fileSize=filesize, where='mongodb',
                                                                           row_num=row_num, column_num=column_num)
                        os.remove(os.path.join(settings.MEDIA_ROOT, str(path)))
                        over_time = time.time()
                        print('User <%s> uploaded file <%s> in <%s> seconds' % (user, path, over_time - start_time))
                        return Response({'msg': 'Data stored successfully', 'status': True})
                else:
                    return Response({'msg': 'This file type is not supported yet', 'status': False})
        else:
            return Response({'msg': 'Not a valid data file', 'status': False})
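The encoding check in the view reads only the first line of the file; a minimal standalone sketch of that chardet pattern (sample.csv is a placeholder):

import chardet

with open("sample.csv", "rb") as f:
    guess = chardet.detect(f.readline())  # detect from the first line only
if guess['encoding'] is None:
    raise ValueError("could not detect the file encoding")
print(guess['encoding'], guess['confidence'])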
Example 9
    def get(self, request, detail_id):
        id = detail_id
        view_num = DataSource.objects.get(id=id).view_num
        DataSource.objects.filter(id=id).update(view_num=view_num + 1)
        user_id = request.user.id
        object = DataSource.objects.get(id=id)
        where = object.obj_id
        if where == '':
            try:
                file = object.format_filename
                file = str(file)
                header = 'media/' + str(object.user.image)
                username = object.user.username
                hdfs_path = '/datahoop/' + file
                client = Client(HDFS_HOST)
                json = {}
                json['header'] = header

                file_user_id = object.user.id
                relation = Relationship.objects.filter(author_id=user_id)
                all = []
                for i in relation:
                    all.append(i.User_ByID)
                if str(file_user_id) in all:
                    json['is_focus'] = 1
                else:
                    json['is_focus'] = 0

                objects = Collect.objects.filter(source=1, user=user_id, file_id=id)
                if objects:
                    json['is_collect'] = 1
                else:
                    json['is_collect'] = 0
                love = Love.objects.filter(user=user_id, file_id=id, source=1)
                if love:
                    json['is_love'] = 1
                else:
                    json['is_love'] = 0
                file_user_id = object.user_id
                if user_id == file_user_id:
                    json['is_me'] = 1
                else:
                    json['is_me'] = 0
                json['file_name'] = str(object.file_name)
                json['title'] = object.detail

                json['hdfs'] = object.format_filename
                json['fav_num'] = object.fav_num
                json['view_num'] = object.view_num
                json['thumb_num'] = object.thumb_num
                json['label'] = [object.label_name, '']
                json['username'] = username
                json['data'] = []
                with client.read(hdfs_path, encoding='utf-8') as reader:
                    for i in (reader.readlines())[0:20]:
                        print(i)
                        json['data'].append(i.split(','))
                return Response(json, status=status.HTTP_200_OK)
            except Exception as e:
                print(e)
                return Response(status=status.HTTP_404_NOT_FOUND)
        else:
            try:
                client = pymongo.MongoClient(MONGO_DB_URI)
                db = client.datahoop.data
                json = {}
                fileName = str(object.file_name)
                obj_id = object.obj_id
                file_user_id = object.user.id
                relation = Relationship.objects.filter(author_id=user_id)
                all = []
                for i in relation:
                    all.append(i.User_ByID)
                if str(file_user_id) in all:
                    json['is_focus'] = 1
                else:
                    json['is_focus'] = 0

                result = db.find({'_id': ObjectId(obj_id)})
                fileType = (fileName.split('.')[-1]).lower()  # get the file extension
                table_name = fileName
                sheetList = ''
                sheet = ''
                json['file_user_id'] = file_user_id
                json['header'] = 'media/' + str(object.user.image)
                json['file_name'] = str(object.file_name)
                if user_id == file_user_id:
                    json['is_me'] = 1
                else:
                    json['is_me'] = 0
                json['title'] = object.detail
                objects = Collect.objects.filter(source=1, user=user_id, file_id=id)
                if objects:
                    json['is_collect'] = 1
                else:
                    json['is_collect'] = 0
                love = Love.objects.filter(user=user_id, file_id=id, source=1)
                if love:
                    json['is_love'] = 1
                else:
                    json['is_love'] = 0
                json['obj_id'] = object.obj_id
                json['fav_num'] = object.fav_num
                json['view_num'] = object.view_num
                json['thumb_num'] = object.thumb_num
                json['label'] = [object.label_name, '']
                json['username'] = object.user.username
                if fileType in ('xls', 'xlsx'):  # read an Excel file, return a list
                    sheetList = (sorted((result[0]['fileData'])))
                    rel = sorted((result[0]['fileData']).items())[0][1][0:501]
                    default_sheet = sorted((result[0]['fileData']).items())[0][0]
                    sheet = request.GET.get('sheet')
                    if sheet:
                        rel = (result[0]['fileData'][sheet][0:501])
                    else:
                        sheet = default_sheet
                elif fileType in ('csv', 'txt'):
                    import pandas as pd
                    # empty = pandas.DataFrame()
                    # data = empty.append(result[0]['fileData'])
                    # rel = data[0:].values.tolist()[0:20]
                    data = pd.DataFrame(result[0]['fileData'])
                    rel = data.values.tolist()[0:200]
                    for i in range(len(rel)):
                        for j in range(len(rel[i])):
                            if str(rel[i][j]) == "nan":
                                rel[i][j] = ""
                    json['data'] = rel
                elif fileType == 'sql':
                    rel = (result[0]['fileData'])[0:20]
                    json['data'] = rel
                client.close()
                return Response(json, status=status.HTTP_200_OK)
            except Exception as e:
                print(e)
                return Response(status=status.HTTP_404_NOT_FOUND)
Example 10
from hdfs import Client

# Assumed setup for this walkthrough (placeholder endpoint and paths):
client = Client("http://localhost:50070")
file_dir = "/demo"
file_name = "/demo/hello.txt"
file_name2 = "/demo/hello_renamed.txt"
local_file_name = "hello_local.txt"

# Create a directory
client.makedirs(file_dir)

# Get status info for the target (strict=False returns None if the path is missing)
info = client.status(file_name, strict=False)
print(info)

# Write a file (overwrite)
client.write(file_name, data="hello hdfs !", overwrite=True)

# Write a file (append)
client.write(file_name, data="hello way !", overwrite=False, append=True)

# Read the file contents
with client.read(file_name, encoding='utf-8') as f:
    print(f.read())

# Download a file
client.download(file_name, local_file_name, overwrite=True)

# Upload a file
client.upload(file_name + '111', local_file_name, cleanup=True)

# Delete a file
client.delete(file_name2)

# Rename a file
client.rename(file_name, file_name2)

# List the files under a directory
print(client.list(file_dir))
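Beyond list, the client can also traverse a tree recursively; a short sketch using client.walk, which yields (path, dirs, files) tuples in the style of os.walk (depth=0 means no depth limit):

for root, dirs, files in client.walk(file_dir, depth=0):
    print(root, dirs, files)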