Example #1
import time

from hdfs import Client

# CtrlHive, rowkey and logger are assumed to be project-level helpers imported elsewhere.


class HdfsPipeline(object):
    def __init__(self, **kwargs):
        self.table_cols_map = {}  # column order per table {table: (cols, col_default)}
        self.bizdate = time.strftime('%Y-%m-%d')  # business date = the day the spider starts (bizdate was undefined; format assumed)
        self.buckets_map = {}  # buckets {table: items}
        self.bucketsize = kwargs.get('BUCKETSIZE')
        self.client = Client(kwargs.get('HDFS_URLS'))
        self.dir = kwargs.get('HDFS_FOLDER')  # HDFS folder path
        self.delimiter = kwargs.get('HDFS_DELIMITER')  # column delimiter; defaults to Hive's default delimiter
        self.encoding = kwargs.get('HDFS_ENCODING')  # file encoding; defaults to 'utf-8'
        self.hive_host = kwargs.get('HIVE_HOST')
        self.hive_port = kwargs.get('HIVE_PORT')
        self.hive_dbname = kwargs.get('HIVE_DBNAME')  # Hive database name
        self.hive_auto_create = kwargs.get('HIVE_AUTO_CREATE',
                                           False)  # auto-create Hive tables; defaults to False
        self.client.makedirs(self.dir)

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        return cls(**settings)

    def process_item(self, item, spider):
        """
        :param item:
        :param spider:
        :return: 数据分表入库
        """
        if item.tablename in self.buckets_map:
            self.buckets_map[item.tablename].append(item)
        else:
            cols, col_default = [], {}
            for field, value in item.fields.items():
                cols.append(field)
                col_default[field] = item.fields[field].get('default', '')
            cols.sort(key=lambda x: item.fields[x].get('idx', 1))
            self.table_cols_map.setdefault(
                item.tablename, (cols, col_default))  # record the table schema: column order and default values
            self.buckets_map.setdefault(item.tablename, [item])
            if self.hive_auto_create:
                self.checktable(item.tablename, cols)  # create the Hive table
        self.buckets2db(bucketsize=self.bucketsize,
                        spider_name=spider.name)  # flush any bucket that has reached the threshold
        return item

    def close_spider(self, spider):
        """
        :param spider:
        :return:  爬虫结束时,将桶里面剩下的数据 入库
        """
        self.buckets2db(bucketsize=1, spider_name=spider.name)

    def checktable(self, tbname, cols):
        """
        :return: 创建 hive 表
        """
        hive = CtrlHive(self.hive_host, self.hive_port, self.hive_dbname)
        cols = ['keyid'] + cols + ['bizdate', 'ctime', 'spider']
        create_sql = f"create table if not exists {tbname}({' string,'.join(cols)} string)"
        hive.execute(create_sql)
        logger.info(f"表创建成功 <= 表名:{tbname}")

    def buckets2db(self, bucketsize=100, spider_name=''):
        """
        :param bucketsize:  桶大小
        :param spider_name:  爬虫名字
        :return: 遍历每个桶,将满足条件的桶,入库并清空桶
        """
        for tablename, items in self.buckets_map.items(
        ):  # 遍历每个桶,将满足条件的桶,入库并清空桶
            if len(items) >= bucketsize:
                new_items = []
                cols, col_default = self.table_cols_map.get(tablename)
                for item in items:
                    keyid = rowkey()
                    new_item = {'keyid': keyid}
                    for field in cols:
                        value = item.get(field, col_default.get(field))
                        new_item[field] = str(value).replace(
                            self.delimiter, '').replace('\n', '')
                    new_item['bizdate'] = self.bizdate  # add non-business audit fields
                    new_item['ctime'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                                      time.localtime())
                    new_item['spider'] = spider_name
                    value = self.delimiter.join(new_item.values())
                    new_items.append(value)

                # each table gets its own folder
                folder = f"{self.dir}/{tablename}"
                self.client.makedirs(folder)

                filename = f"{folder}/data.txt"
                info = self.client.status(filename, strict=False)
                if not info:
                    self.client.write(filename,
                                      data='',
                                      overwrite=True,
                                      encoding=self.encoding)

                try:
                    content = '\n'.join(new_items) + '\n'
                    self.client.write(filename,
                                      data=content,
                                      overwrite=False,
                                      append=True,
                                      encoding=self.encoding)
                    logger.info(f"保存成功 <= 文件名:{filename} 记录数:{len(items)}")
                    items.clear()  # 清空桶
                except Exception as e:
                    logger.error(f"保存失败 <= 文件名:{filename} 错误原因:{e}")
Example #2
import os

import jieba
from hdfs import Client


class ChatBotModel(object):
    def __init__(self,
                 hadoop_url,
                 hdfs_index_file,
                 local_index_file,
                 corpus_dir,
                 unk_answer='',
                 max_answer_len=1024):
        self.hadoop_url = hadoop_url
        self.hdfs_index_file = hdfs_index_file
        self.local_index_file = local_index_file
        self.corpus_dir = corpus_dir
        self.max_answer_len = max_answer_len
        self.unk_answer = unk_answer
        self.client = None
        self.inverted_index = {}

    def build_connection(self):
        self.client = Client(self.hadoop_url)

    def fetch_index_file(self):
        self.client.download(hdfs_path=self.hdfs_index_file,
                             local_path=self.local_index_file,
                             overwrite=True)

    def load_inverted_index(self):
        with open(self.local_index_file, 'r', encoding='utf-8') as f:
            for line in f:
                word, *querys = line.strip().split('\t')
                for query in querys:
                    file_name, query_id, score = query.split(':')
                    self.inverted_index.setdefault(word, []).append(
                        [file_name, int(query_id), float(score)])

    def prepare(self):
        self.build_connection()
        self.fetch_index_file()
        self.load_inverted_index()

    def read_corpus_answer(self, file_name, query_id):
        file_path = os.path.join(self.corpus_dir, file_name)
        file_status = self.client.status(file_path)
        if file_status['length'] <= query_id:
            return None
        with self.client.read(hdfs_path=file_path,
                              offset=query_id,
                              length=self.max_answer_len,
                              encoding='utf-8') as f:
            answer = f.read().strip().split('\n')[0]
            return answer

    def predict_answer(self, query):
        words = jieba.lcut_for_search(query)
        querys = {}
        for word in words:
            if word not in self.inverted_index:
                continue
            for file_name, query_id, score in self.inverted_index[word]:
                query_key = (file_name, query_id)  # avoid shadowing the query argument
                if query_key in querys:
                    querys[query_key] += score
                else:
                    querys[query_key] = score
        if len(querys) == 0:
            return self.unk_answer
        best_query = max(querys.items(), key=lambda x: x[1])
        (best_file_name, best_query_id), best_score = best_query
        best_answer = self.read_corpus_answer(best_file_name, best_query_id)
        if best_answer is None:
            best_answer = self.unk_answer
        return best_answer
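
A hedged end-to-end sketch of how the model above might be wired up; the URL, HDFS paths and fallback answer are placeholders, not from the original source:

if __name__ == '__main__':
    bot = ChatBotModel(
        hadoop_url='http://namenode:50070',           # WebHDFS endpoint (placeholder)
        hdfs_index_file='/chatbot/inverted_index.txt',
        local_index_file='inverted_index.txt',
        corpus_dir='/chatbot/corpus',
        unk_answer='Sorry, I do not know that one.')
    bot.prepare()                                     # connect, download the index, load it into memory
    print(bot.predict_answer('How do I reset my password?'))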
Example #3
    def post(self, request, *args, **kwargs):
        import uuid
        permission_classes = (IsAuthenticated,)  # note: has no effect here; DRF only honours this as a class attribute
        start_time = time.time()
        file_serializer = DatasourceSerializer(data=request.data)

        if file_serializer.is_valid():
            path = file_serializer.validated_data['file_name']
            user = request.user.id
            # size of the uploaded file, in MB
            filesize = round((path.size) / 1024 / 1024, 2)

            #  total size of the user's existing files
            mydata_id = DataSource.objects.filter(user_id=user)
            myData_size = 0
            for i in mydata_id:
                try:
                    x = i.fileSize.replace("KB", '')
                    myData_size += float(x)
                except Exception:
                    continue
            myData_size = round(myData_size / 1024, 2)  # in MB

            #  total size the user would occupy after this upload
            now_userDataSize = filesize + myData_size

            #  look up the user's upload quota
            user_dataSize_old = UserLevel.objects.get(user_id=user).up_load

            if now_userDataSize > user_dataSize_old:
                return Response({'msg': 'Your storage quota is insufficient; please free up space and try again', 'status': False})
            else:
                try:
                    is_header = file_serializer.validated_data['is_header']
                    separator = file_serializer.validated_data['column_delimiter']
                except Exception:
                    # SQL dump uploads carry no header row, so fall back to defaults
                    is_header = ''
                    separator = '\n'
                last = (str(path).lower()).split('.')[-1].upper()
                if last in ('CSV', 'TXT', 'SQL'):
                    if path.size > LIMIT_FILE_SIZE:
                        format_name = uuid.uuid1()
                        file_serializer.validated_data['format_filename'] = format_name
                        file_serializer.save()
                        client = Client(HDFS_HOST)
                        file_path = os.path.join(settings.MEDIA_ROOT, str(path))
                        with open(file_path, 'rb') as f1:  # detect the file encoding
                            data_type = chardet.detect(f1.readline())['encoding']
                        if data_type is None:  # chardet returns None, not the string 'None'
                            return Response({'msg': 'Invalid data format', 'status': False})
                        os.renames(file_path, os.path.join(settings.MEDIA_ROOT, str(format_name)))
                        client.upload("/datahoop", os.path.join(settings.MEDIA_ROOT, str(format_name)), n_threads=4)
                        os.remove(os.path.join(settings.MEDIA_ROOT, str(format_name)))
                        try:
                            with client.read('/datahoop/' + str(format_name), encoding=data_type) as reader:
                                filesize = ((client.status('/datahoop/' + str(format_name)))['length']) / 1024
                                filesize = str(round(filesize, 2)) + 'KB'
                                reader = reader.readlines()
                        except Exception:
                            return Response({'msg': 'Failed to read the data', 'status': False})
                        column_delimiter = separator
                        if is_header == 1:
                            title = (reader[0]).split(column_delimiter)
                            json = {}
                            s = ((reader[0]).split(column_delimiter))
                            for i in s:
                                json[i.replace('\r\n', '')] = [typ.StringType, True]
                            print(json)
                        else:
                            total = len((reader[0]).split(column_delimiter))
                            title = []
                            for i in range(total):
                                title.append('_C' + str(i))
                            json = {}
                            for i in title:
                                json[i] = [typ.StringType, True]

                        column_num = len((reader[0]).split(column_delimiter))
                        row_num = len(reader)
                        DataSource.objects.filter(format_filename=format_name).update(user_id=user,
                                                                                      title=title[:20],
                                                                                      fileSize=filesize,
                                                                                      where='hdfs', row_num=row_num,
                                                                                      column_num=column_num)
                        over_time = time.time()
                        print('User <%s> uploaded file <%s> in <%s> seconds' % (user, format_name, over_time - start_time))
                        return Response({'msg': 'Data stored successfully', 'status': True})
                    else:
                        global object_id
                        filePath = os.path.join(settings.MEDIA_ROOT, str(path))
                        file_serializer.save()
                        filesize = str(round((path.size) / 1024, 2)) + 'KB'
                        if last == 'XLS' or last == 'XLSX':
                            pass  # unreachable here: the outer check only allows CSV, TXT and SQL
                        elif last == 'TXT':
                            object_id = tools.save_mongo_txt(filePath, user, is_header, separator, str(path))
                            if object_id != 'none':
                                file_serializer.validated_data['obj_id'] = object_id
                                file_serializer.validated_data['file_name'] = str(path)
                                file_serializer.save()
                            else:
                                DataSource.objects.filter(file_name=str(path), user=1).delete()
                                os.remove(os.path.join(settings.MEDIA_ROOT, str(path)))
                                return Response({'msg': 'Invalid data format', 'status': False})
                        elif last == 'CSV':
                            object_id = tools.save_mongo_csv(filePath, user, is_header, separator, str(path))
                            if object_id != 'none':
                                file_serializer.validated_data['obj_id'] = object_id
                                file_serializer.validated_data['file_name'] = str(path)
                                file_serializer.save()
                            else:
                                DataSource.objects.filter(file_name=str(path), user=1).delete()
                                os.remove(os.path.join(settings.MEDIA_ROOT, str(path)))
                                return Response({'msg': 'Invalid data format', 'status': False})
                        elif last == 'SQL':
                            try:
                                object_id = tools.save_mongo_sql(filePath, user)
                                file_serializer.validated_data['obj_id'] = object_id
                                file_serializer.validated_data['file_name'] = str(path)
                                file_serializer.save()
                            except Exception as e:
                                DataSource.objects.filter(file_name=str(path), user=1).delete()
                                os.remove(os.path.join(settings.MEDIA_ROOT, str(path)))
                                return Response({'msg': 'Invalid data format', 'status': False})
                        with open(filePath, 'rb') as f1:  # detect the file encoding
                            data_type = chardet.detect(f1.readline())['encoding']
                        with open(filePath, encoding=data_type, errors='ignore') as reader:  # read the file with the detected encoding
                            reader = reader.readlines()
                        if is_header == 1:
                            title = (reader[0]).split(separator)
                            json = {}
                            s = ((reader[0]).split(separator))
                            for i in s:
                                json[i.replace('\r\n', '')] = [typ.StringType, True]
                            column_num = len((reader[0]).split(separator))
                        else:
                            if last != 'SQL':
                                total = len((reader[0]).split(separator))
                                title = []
                                for i in range(total):
                                    title.append('_C' + str(i))
                                json = {}
                                for i in title:
                                    json[i] = [typ.StringType, True]
                                column_num = len((reader[0]).split(separator))
                            else:
                                total = re.findall(r'[^()]+', reader[0])[1].split(',')
                                title = []
                                for i in range(len(total)):
                                    title.append('_C' + str(i))
                                json = {}
                                for i in title:
                                    json[i] = [typ.StringType, True]
                                column_num = len(total)
                        row_num = len(reader)
                        DataSource.objects.filter(obj_id=object_id).update(user_id=user, title=title[:20],
                                                                           fileSize=filesize, where='mongodb',
                                                                           row_num=row_num, column_num=column_num)
                        os.remove(os.path.join(settings.MEDIA_ROOT, str(path)))
                        over_time = time.time()
                        print('User <%s> uploaded file <%s> in <%s> seconds' % (user, path, over_time - start_time))
                        return Response({'msg': 'Data stored successfully', 'status': True})
                else:
                    return Response({'msg': 'This file type is not yet supported for upload', 'status': False})
        else:
            return Response({'msg': 'The submitted data is not valid', 'status': False})
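
Stripped of the Django plumbing, the HDFS branch of the view reduces to: detect the encoding with chardet, rename the upload to a UUID, push it to the /datahoop directory, then read it back to count rows. A standalone sketch of just that flow under the same assumptions (the namenode URL is a placeholder and the helper name is hypothetical):

import os
import uuid

import chardet
from hdfs import Client


def upload_to_datahoop(local_path, hdfs_url='http://namenode:50070', hdfs_dir='/datahoop'):
    """Detect encoding, rename to a UUID, upload, and return (name, size in KB, row count)."""
    with open(local_path, 'rb') as f:
        encoding = chardet.detect(f.readline())['encoding']
    if encoding is None:
        raise ValueError('could not detect the file encoding')

    format_name = str(uuid.uuid1())
    renamed = os.path.join(os.path.dirname(local_path) or '.', format_name)
    os.renames(local_path, renamed)                   # anonymise the local file name

    client = Client(hdfs_url)
    client.upload(hdfs_dir, renamed, n_threads=4)     # lands as {hdfs_dir}/{format_name}
    os.remove(renamed)

    hdfs_path = f'{hdfs_dir}/{format_name}'
    size_kb = round(client.status(hdfs_path)['length'] / 1024, 2)
    with client.read(hdfs_path, encoding=encoding) as reader:
        row_num = len(reader.readlines())
    return format_name, size_kb, row_num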
Example #4
from hdfs import Client


def is_hdfs_directory(hdfs_client: Client, path: str):
    return hdfs_client.status(path)["type"] == "DIRECTORY"
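
A usage note: Client.status raises hdfs.util.HdfsError when the path does not exist, so a caller that is unsure the path exists may want to check first with strict=False. A small sketch (the namenode URL is a placeholder):

from hdfs import Client

client = Client('http://namenode:50070')

info = client.status('/tmp', strict=False)            # None if the path is missing
if info is not None and is_hdfs_directory(client, '/tmp'):
    print('/tmp is a directory')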
Example #5
from hdfs import Client

HDFS_CLIENT = "http://172.16.122.21:50070;http://172.16.122.24:50070"

file_dir = '/tmp/way'
file_name = '/tmp/way/test.txt'
file_name2 = '/tmp/way/test123.txt'
local_file_name = 'test.txt'

client = Client(HDFS_CLIENT)

# Create the directory
client.makedirs(file_dir)

# Get the status of the target path (None when it does not exist, since strict=False)
info = client.status(file_name, strict=False)
print(info)

# Write to the file (overwrite)
client.write(file_name, data="hello hdfs !", overwrite=True)

# Write to the file (append)
client.write(file_name, data="hello way !", overwrite=False, append=True)

# Read the file contents
with client.read(file_name, encoding='utf-8') as f:
    print(f.read())

# Download the file
client.download(file_name, local_file_name, overwrite=True)
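
The snippet defines file_name2 but never uses it, presumably for a rename. A small follow-on sketch (same client and paths as above) covering rename, listing and deletion:

# Rename (move) the file
client.rename(file_name, file_name2)

# List the directory contents
print(client.list(file_dir))

# Delete the renamed file; delete() returns False if the path did not exist
print(client.delete(file_name2))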
Example #6
from hdfs import Client
from hdfs.util import HdfsError

# WebHDFS (the login helper) and HdfsLibraryError are assumed to be provided
# by the surrounding Robot Framework library.


class RF_HDFS(object):
    def __init__(self):
        self.client = None
        self.directory = None

    def connect_and_login(self, **kwargs):
        import requests

        host = kwargs.get('host')
        port = kwargs.get('port')
        kdc = kwargs.get('kdc')  # previously only set when passed, leaving WebHDFS(url, kdc) with an undefined name
        user = kwargs.get('user')
        password = kwargs.get('password')
        root = kwargs.get('root')
        proxy = kwargs.get('proxy')
        timeout = kwargs.get('timeout')

        self.session = requests.Session()
        adapter = requests.adapters.HTTPAdapter(pool_maxsize=0)
        self.session.mount('http://',  adapter)
        self.session.mount('https://', adapter)
        self.session.headers.update({'Connection':'Keep-Alive'})

        self.connectionStatus = False
        try:
            timeout = int(timeout)
            url = "http://" + host + ":" + str(port)

            hdfsLogin = WebHDFS(url, kdc)
            cookieStr = hdfsLogin.authenticate(user, password)
            if cookieStr is not None:
                cookieList = cookieStr.split('=', 1)
                cookieDict = {cookieList[0]: cookieList[1]}
                requests.utils.add_dict_to_cookiejar(self.session.cookies, cookieDict)

            self.client = Client(url, root=root, proxy=proxy, timeout=timeout, session=self.session)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

        self.connectionStatus = True
        return self.client

    def checkConnectionStatus(self):
        return self.connectionStatus

    def list_dir(self, directory):
        output = []
        try:
            if directory is not None:
                output = self.client.list(directory, status=True)
            else:
                output = self.client.list(self.client.root, status=True)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        return output

    def list_names(self, directory):
        output = []
        try:
            if directory is not None:
                output = self.client.list(directory, status=False)
            else:
                output = self.client.list(self.client.root, status=False)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        return output

    def upload(self, remote_path, local_path, overwrite=False, permission=None):
        output = None
        try:
            # overwrite must be passed by keyword: Client.upload's third positional parameter is n_threads
            output = self.client.upload(remote_path, local_path, overwrite=overwrite, permission=permission)
        except HdfsError as hdfsError:
            # For some reason this exception includes the entire stack trace after
            # the error message, so split on '\n' and only return the first line.
            error = str(hdfsError).splitlines()[0]
            raise HdfsLibraryError(error)
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        return output

    def download(self, remote_path, local_path, overwrite=False):
        output = None
        try:
            output = self.client.download(remote_path, local_path, overwrite)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        return output

    def mkdir(self, directory, permission):
        try:
            # no return value
            self.client.makedirs(directory, permission=permission)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def rmdir(self, directory):
        try:
            # no return value
            if not self.client.delete(directory, recursive=True):
                raise HdfsLibraryError("Directory does not exist: %r" % directory)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def rename(self, src_file, dst_file):
        try:
            # no return value
            self.client.rename(src_file, dst_file)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def delete(self, file):
        try:
            # no return value
            if not self.client.delete(file):
                raise HdfsLibraryError("File does not exist: %r" % file)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def set_time(self, file, mod_time):
        try:
            # no return value
            self.client.set_times(file, -1, mod_time)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def set_owner(self, file, owner, group):
        try:
            # no return value
            self.client.set_owner(file, owner=owner, group=group)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def set_permission(self, file, permission):
        try:
            # no return value
            self.client.set_permission(file, permission=permission)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def set_acl(self, file, aclspec):
        try:
            # no return value
            self.client.set_acl(file, aclspec=aclspec)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def status(self, path):
        output = ''
        try:
            output = self.client.status(path)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        return output

    def checksum(self, path):
        output = ''
        try:
            output = self.client.checksum(path)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        return output

    def close(self):
        self.session.close()
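
A hedged usage sketch of the wrapper above, assuming the undefined WebHDFS login helper and HdfsLibraryError are provided by the surrounding Robot Framework library; host, credentials and paths are placeholders:

rf = RF_HDFS()
rf.connect_and_login(host='namenode', port=50070, kdc='kdc.example.com',
                     user='robot', password='secret', root='/', timeout=30)
print(rf.list_names('/tmp'))                   # plain names
print(rf.list_dir('/tmp'))                     # (name, status) pairs
rf.mkdir('/tmp/robot-test', permission='755')
rf.upload('/tmp/robot-test/data.txt', 'data.txt', overwrite=True)
print(rf.status('/tmp/robot-test/data.txt'))
rf.rmdir('/tmp/robot-test')
rf.close()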