class ClinicalData(db.Document):
    """Clinical record of one patient inside a subproject.

    Indexing scheme: project --> subproject --> patient. The same patient
    cannot appear twice in the same subproject.
    """

    age = db.FloatField(min_value=0.001, max_value=1200)  # unit: months
    # NOTE(review): the name says "age ... in years" but the bounds
    # (1900-2999) look like a calendar year -- confirm which is intended.
    age_began_smoking_in_years = db.IntField(min_value=1900, max_value=2999)
    # 0: no drinking history, 1: has a drinking history
    alcohol_history_documented = db.BooleanField()
    # The "comsumption" spelling is part of the stored field name; renaming
    # it would change the persisted schema, so it is kept as is.
    amount_of_alcohol_comsumption_per_day = db.FloatField(
        min_value=0.01, max_value=100)
    # The upper bound has to be passed as ``max_value``; the previous
    # ``max=7`` keyword is not a validation bound, so values above 7 were
    # never rejected.
    frequency_of_alcohol_consumption = db.IntField(min_value=1, max_value=7)
    gender = db.BooleanField()  # 0: female, 1: male
    height = db.FloatField(min_value=30, max_value=300)
    lymph_node_examined_count = db.FloatField(min_value=0.1, max_value=100)
    number_of_lymphnodes_position_by_he = db.FloatField(
        min_value=0.1, max_value=100)
    number_of_lymphnodes_position_by_ihc = db.FloatField(
        min_value=0.1, max_value=100)
    number_pack_years_smoked = db.FloatField(min_value=0.01, max_value=100)
    OS = db.FloatField(min_value=0.01, max_value=1200)
    OS_IND = db.BooleanField()
    patient_id = db.StringField(max_length=50, required=True)
    # Referenced by document name so this class does not require ``Project``
    # to be defined before it in the module.
    project_ref = db.ReferenceField('Project', required=True)
    project_name = db.StringField(max_length=30, required=True)
    subproject_name = db.StringField(max_length=30, required=True)
    RFS = db.FloatField(min_value=0.01, max_value=1200)
    RFS_IND = db.BooleanField()
    stopped_smoking_year = db.IntField(min_value=1900, max_value=2999)
    tobacco_smoking_history = db.BooleanField()
    weight = db.FloatField(min_value=1, max_value=500)
class Project(db.Document):
    """A project (data set) together with its provenance.

    data_file_md5_lst: several raw data files may have to be imported for
        one project, hence a list.
    src_project_id_ref_lst: the data of the current project may originate
        from several source projects, hence a list.

    A source project corresponds to a subproject. This is used when several
    projects are aggregated into one project; every aggregation implies that
    the data went through some kind of processing.
    """

    # ``AnalysisProgram`` is referenced by document name so this class does
    # not require it to be defined earlier in the module.
    analysis_pipeline_ref = db.ReferenceField('AnalysisProgram', required=True)
    created_date = db.DateTimeField(required=True)
    data_file_md5_lst = db.ListField(
        db.StringField(max_length=32, required=True), required=True)
    data_file_name_lst = db.ListField(
        db.StringField(max_length=255, required=True), required=True)
    data_type = db.StringField(max_length=10, required=True)
    description = db.StringField(max_length=512)
    import_data_program_ref = db.ReferenceField(
        'AnalysisProgram', required=True)
    normalized = db.BooleanField()
    normalized_method = db.StringField(max_length=10, required=True)
    num_of_samples = db.IntField(required=True)
    project_name = db.StringField(max_length=30, required=True)
    # Self-reference: the source projects are themselves ``Project`` documents.
    src_project_id_ref_lst = db.ListField(
        db.ReferenceField('Project', required=True), required=True)
    src_project_name_lst = db.ListField(
        db.StringField(max_length=30, required=True), required=True)
    url = db.StringField(max_length=100)
    version = db.StringField(max_length=10, required=True)
class TranscriptExpr(db.Document):
    """Expression values stored per transcript.

    Indexing scheme: project --> subproject --> transcript. The subproject
    narrows the search scope; within one subproject a transcript is uniquely
    determined.
    """

    clinical_data_id_lst = db.ListField(
        db.ReferenceField('ClinicalData', required=True),
        required=True,
    )
    expr_value_lst = db.ListField(db.FloatField(), required=True)
    phenotype_data_id_lst = db.ListField(
        db.ReferenceField('PhenotypeData', required=True),
        required=True,
    )
    project_name = db.StringField(max_length=30, required=True)
    project_ref = db.ReferenceField(Project, required=True)
    samples_data_id_lst = db.ListField(
        db.ReferenceField('SampleData', required=True),
        required=True,
    )
    source_type = db.StringField(max_length=50, required=True)
    species = db.StringField(max_length=10, required=True)
    subproject_name = db.StringField(max_length=30, required=True)
    transcript_ensembl_id = db.StringField(max_length=50, required=True)
class GeneExpr(db.Document):
    """Expression values stored per gene.

    Indexing scheme: project --> subproject --> gene. The subproject narrows
    the search scope; within one subproject a gene is uniquely determined.

    Performance pitfall: sample ids, clinical data ids and the like may only
    be used to index into the expression values. They must not be used to
    look up a gene, otherwise performance will suffer. In other words, a gene
    or transcript is uniquely determined by project/subproject, not by sample
    or patient.
    """

    clinical_data_id_lst = db.ListField(
        db.ReferenceField('ClinicalData', required=True),
        required=True,
    )
    expr_value_lst = db.ListField(db.FloatField(), required=True)
    gene_ensembl_id = db.StringField(max_length=50, required=True)
    project_ref = db.ReferenceField(Project, required=True)
    project_name = db.StringField(max_length=30, required=True)
    subproject_name = db.StringField(max_length=30, required=True)
    samples_data_id_lst = db.ListField(
        db.ReferenceField('SampleData', required=True),
        required=True,
    )
    species = db.StringField(max_length=10, required=True)
    source_type = db.StringField(max_length=50, required=True)
    phenotype_data_id_lst = db.ListField(
        db.ReferenceField('PhenotypeData', required=True),
        required=True,
    )
class AnalysisProgram(db.Document):
    """Metadata record for an analysis program.

    Stores the program's name, author, creation date, file path, md5 string
    and documentation file; only ``description`` is optional.
    """

    created_author = db.StringField(max_length=20, required=True)
    created_date = db.DateTimeField(required=True)
    description = db.StringField(max_length=500, required=False)
    document_file = db.StringField(max_length=255, required=True)
    md5 = db.StringField(max_length=32, required=True)
    path = db.StringField(max_length=255, required=True)
    program_name = db.StringField(max_length=100, required=True)
class SampleData(db.Document):
    """Record of one sample inside a subproject.

    Indexing scheme: project --> subproject --> sample. The same sample
    cannot appear twice in the same subproject.
    """

    clinical_data_id_ref = db.ReferenceField(ClinicalData, required=True)
    concentration = db.FloatField(min_value=0.001, max_value=1000)
    ERCC = db.StringField(max_length=10)
    FFPE = db.BooleanField()
    histological_type = db.StringField(max_length=30, required=True)
    od260_280 = db.FloatField(min_value=0.001, max_value=1000)
    primary_site = db.StringField(max_length=30, required=True)
    project_ref = db.ReferenceField(Project, required=True)
    project_name = db.StringField(max_length=30, required=True)
    subproject_name = db.StringField(max_length=30, required=True)
    reads = db.FloatField(min_value=0.01, max_value=1000)
    rin = db.FloatField(min_value=0.001, max_value=1000)
    sample_id = db.StringField(max_length=50, required=True)
    source = db.StringField(max_length=30, required=True)
    species = db.StringField(max_length=10, required=True)
    tissue_histological_subtype = db.StringField(
        max_length=30, required=False)
    tissue_molecular_subtype = db.StringField(max_length=30, required=False)
    volume = db.FloatField(min_value=0.001, max_value=1000)
    weight = db.FloatField(min_value=0.001, max_value=1000)
    yields = db.FloatField(min_value=0.01, max_value=1000)
class ExprInfo(db.Document):
    """Temporary resource used for multi-ID query scenarios.

    Among the list fields only ``expr_value_lst`` and ``source_type_lst``
    are declared as required.
    """

    clinical_data_id_lst = db.ListField(
        db.ReferenceField('ClinicalData', required=True),
    )
    expr_value_lst = db.ListField(db.FloatField(), required=True)
    transcript_ensembl_id_lst = db.ListField(
        db.StringField(max_length=50, required=True),
    )
    gene_ensembl_id_lst = db.ListField(
        db.StringField(max_length=50, required=True),
    )
    project_ref = db.ReferenceField(Project, required=True)
    subproject_name_lst = db.ListField(
        db.StringField(max_length=30, required=True),
    )
    project_name = db.StringField(max_length=30, required=True)
    samples_data_id_lst = db.ListField(
        db.ReferenceField('SampleData', required=True),
    )
    species_lst = db.ListField(
        db.StringField(max_length=10, required=True),
    )
    source_type_lst = db.ListField(
        db.StringField(max_length=50, required=True),
        required=True,
    )
    phenotype_data_id_lst = db.ListField(
        db.ReferenceField('PhenotypeData', required=True),
    )
    # NOTE(review): presumably a digest of the query condition that produced
    # this temporary record -- confirm against the code that writes it.
    query_condition_md5 = db.StringField(max_length=128, required=True)
class Mutation(db.Document):
    """Record of one mutation, linked to its project, samples and patients.

    Every field is required. Positions, alleles and classifications are all
    stored as strings.
    """

    chromosome = db.StringField(max_length=10, required=True)
    clinical_data_id_lst = db.ListField(
        db.ReferenceField('ClinicalData', required=True),
        required=True,
    )
    dbsnp_rs = db.StringField(max_length=50, required=True)
    end_position = db.StringField(max_length=50, required=True)
    gene_ensembl_id = db.StringField(max_length=50, required=True)
    transcript_ensembl_id = db.StringField(max_length=50, required=True)
    phenotype_data_id_lst = db.ListField(
        db.ReferenceField('PhenotypeData', required=True),
        required=True,
    )
    project_name = db.StringField(max_length=30, required=True)
    project_ref = db.ReferenceField(Project, required=True)
    protein_change = db.StringField(max_length=50, required=True)
    reference_allele = db.StringField(max_length=50, required=True)
    samples_data_id_lst = db.ListField(
        db.ReferenceField('SampleData', required=True),
        required=True,
    )
    source_type = db.StringField(max_length=50, required=True)
    species = db.StringField(max_length=10, required=True)
    start_position = db.StringField(max_length=50, required=True)
    subproject_name = db.StringField(max_length=30, required=True)
    transcript_strand = db.StringField(max_length=50, required=True)
    tumor_sample_barcode = db.StringField(max_length=50, required=True)
    tumor_seq_allele1 = db.StringField(max_length=50, required=True)
    tumor_seq_allele2 = db.StringField(max_length=50, required=True)
    variant_classification = db.StringField(max_length=50, required=True)
    variant_type = db.StringField(max_length=50, required=True)