def update_one_sample(gsmid, ddir='geo', parse_fields=['other_ids', 'paper', 'name', 'species', 'description', 'antibody', 'factor', 'cell type', 'cell line', 'cell pop', 'tissue', 'strain', 'disease', 'update date', 'release date']):
    """Parse the metadata of one GEO sample and return it as a flat row.

    Args:
        gsmid: GEO sample accession. It is handed to the GEO helper
            functions and used to build the XML path
            ``<ddir>/<first 7 characters of gsmid>/<gsmid>.xml``.
        ddir: directory handed to the GEO helpers and used as the root of
            the XML path.
        parse_fields: names of the fields to parse. This function acts on
            'other_ids', 'paper', 'name', 'species', 'description',
            'cell type', 'tissue', 'disease', 'cell pop' and
            'release date'; other names are ignored. The list is only read,
            never modified.

    Returns:
        ``None`` when ``postProcessGeo`` returns nothing for the sample.
        Otherwise a list
        ``[gsmid, species, series_id, pmid, paper, name, cell type, tissue,
        disease, cell pop, release date, description dict, XML content]``
        in which every element after ``gsmid`` has been passed through
        ``str()``. A field that was not requested (or not found) shows up
        as the string ``'None'``.
    """
    print('+++ description')
    description_dict = parseGeoInfo(gsmid, ddir)
    geoPost = postProcessGeo(gsmid, ddir=ddir)
    if not geoPost:
        return None

    # Every value that ends up in the result row gets a default, so a
    # reduced ``parse_fields`` yields 'None' columns instead of a NameError.
    species = series_id = pmid = paper = name = None
    tmp_celltype = tmp_tissue = disease_state = cell_pop = None
    geo_release_date = None

    if 'species' in parse_fields:
        print('+++ species')
        # The literal had been garbled to "H**O SAPIENS", which no organism
        # value can equal, so every sample fell through to pk=2.
        # Assumes getFromPost returns the organism upper-cased -- confirm.
        if getFromPost(geoPost, "organism") == "HOMO SAPIENS":
            species = models.Species.objects.get(pk=1)
        else:
            species = models.Species.objects.get(pk=2)

    if ('other_ids' in parse_fields) or ('paper' in parse_fields):
        print('+++ other IDs')
        gseId = gsmToGse(gsmid)
        if not gseId:
            # The GSE lookup sometimes fails on the first attempt but
            # succeeds when repeated, so retry once after a short pause.
            time.sleep(0.3)
            print('+++ again gse')
            gseId = gsmToGse(gsmid)
        pmid = gseToPubmed(gseId) if gseId else None

        if 'other_ids' in parse_fields:
            idList = {'gse': gseId, 'pmid': pmid}
            print(idList)
            if gseId:
                try:
                    series_id = gse_idToAcc(gseId)
                except Exception:
                    # Best effort: the row is still produced without it.
                    print('cannot find GSE_id')

    if 'paper' in parse_fields and pmid:
        print('+++ paper')
        try:
            paper = pubmed.getOrCreatePaper(pmid)
        except Exception:
            # Best effort: a failed paper lookup must not lose the sample.
            paper = None

    if 'name' in parse_fields:
        print('+++ title')
        name = getFromPost(geoPost, "title")

    if 'description' in parse_fields:
        # The description is returned as str(description_dict) in the row;
        # the JSON copy that used to be built here was never used.
        print('+++ add description')

    if 'cell type' in parse_fields:
        print('+++ cell type')

    # The cell-type search result is also consulted by the tissue, disease
    # and cell-pop steps, so it is computed whenever any of them is wanted.
    searchCellType = None
    if any(field in parse_fields
           for field in ('cell type', 'tissue', 'disease', 'cell pop')):
        searchCellType = parseAndsearch(
            description_dict,
            ['cell type', 'cell lineage', 'cell', 'cell line', 'source name',
             'cell description', 'title'])

    if 'cell type' in parse_fields:
        cell_type_hit = searchCellType['cellType']
        # Keep the cell type only when the same term was not also matched
        # as a cell line or a tissue.
        if cell_type_hit and str(cell_type_hit).upper() not in [
                str(searchCellType['cellLine']).upper(),
                str(searchCellType['tissueType']).upper()]:
            tmp_celltype = cell_type_hit

    if 'tissue' in parse_fields:
        print('+++ tissue')
        searchTisssue = parseAndsearch(
            description_dict,
            ['tissue', 'tissue type', 'tissue depot', 'source name',
             'cell description', 'title', 'cell type', 'cell lineage',
             'cell', 'cell line'])
        # NOTE(review): the condition inspects the cell-type search result
        # while the stored value comes from the tissue search; this is kept
        # exactly as it was -- confirm that it is intended.
        tissue_hit = searchCellType['tissueType']
        if tissue_hit and str(tissue_hit).upper() not in [
                str(searchCellType['cellLine']).upper(),
                str(searchCellType['cellType']).upper(),
                str(searchCellType['cellpop']).upper(),
                str(searchCellType['disease']).upper()]:
            tmp_tissue = searchTisssue['tissueType']
        # The former else-branch re-checked tmp_tissue against the cell
        # type table, but tmp_tissue is always None on that path, so it
        # could never change anything and has been dropped.

    if 'disease' in parse_fields:
        print('+++ disease')
        disease_state = parseDisease(description_dict)
        if searchCellType['disease']:
            disease_state = searchCellType['disease']

    if 'cell pop' in parse_fields:
        print('+++ cell pop')
        cell_pop = parseCellPop(description_dict)
        if searchCellType['cellpop']:
            cell_pop = searchCellType['cellpop']

    if 'release date' in parse_fields:
        geo_release_date = parseReleaseTime(description_dict)

    # Fields for checking the sample type. The path is the same string as
    # before; the single-argument os.path.join around it was a no-op.
    sample_path = ddir + '/' + gsmid[:7] + '/' + gsmid + '.xml'
    # Printed directly instead of through a shell ``echo``, which would
    # have let characters in the path be interpreted by the shell.
    print(sample_path)
    xmlContent = scrna_parser._getFieldXML(sample_path)

    return [
        gsmid,
        str(species),
        str(series_id),
        str(pmid),
        str(paper),
        str(name),
        str(tmp_celltype),
        str(tmp_tissue),
        str(disease_state),
        str(cell_pop),
        str(geo_release_date),
        str(description_dict),
        str(xmlContent),
    ]
def update_one_sample(gseid, ddir='geo', parse_fields=[
        'other_ids', 'paper', 'name', 'species', 'description', 'antibody',
        'factor', 'cell type', 'cell line', 'cell pop', 'tissue', 'strain',
        'disease', 'update date', 'release date'
]):
    """Parse the metadata of one GEO series and return it as a flat row.

    The series XML is read from
    ``<ddir>/<first 7 characters of gseid>/<gseid>.xml``; species and
    accession come from ``_parse_from_html`` / ``_parse_species_gsm``.

    Args:
        gseid: GEO series accession.
        ddir: root directory of the series XML files.
        parse_fields: names of the fields to parse. This function acts on
            'species', 'paper', 'name', 'description', 'cell type',
            'tissue', 'disease', 'cell pop' and 'release date'; other names
            are ignored. The list is only read, never modified.

    Returns:
        A list
        ``[gseid, species, pmid, paper, name, cell type, tissue, disease,
        cell pop, release date, last update date, accession, XML content]``
        in which every element after ``gseid`` has been passed through
        ``str()``. A field that was not requested (or not found) shows up
        as the string ``'None'``.

    Raises:
        KeyError: if the parsed XML lacks one of the series title, summary,
            type or overall-design fields, or a requested date field.
    """
    print('+++ description')
    fields = [
        'Series/Title', 'Series/Summary', 'Series/Type',
        'Series/Overall-Design', 'Series/Status/Release-Date',
        'Series/Status/Last-Update-Date', 'Sample/Accession',
        'Series/Pubmed-ID'
    ]
    # Same string as before; the single-argument os.path.join was a no-op.
    sample_path = ddir + '/' + gseid[:7] + '/' + gseid + '.xml'
    xmlContent = getGEOSamples_byType_gse._getFieldXML(sample_path,
                                                       fields=fields)

    # Every value that ends up in the result row gets a default, so a
    # reduced ``parse_fields`` yields 'None' columns instead of a NameError.
    species = accession = None
    paper = pmid = name = None
    tmp_celltype = tmp_tissue = disease_state = cell_pop = None
    geo_release_date = geo_last_update_date = None

    if 'species' in parse_fields:
        print('+++ species')
        urlcont = _parse_from_html(gseid)
        if urlcont:
            species, accession = _parse_species_gsm(urlcont)
            # Normalise empty results to None.
            species = species or None
            accession = accession or None

    if 'paper' in parse_fields and 'Series/Pubmed-ID' in xmlContent:
        print('+++ paper')
        try:
            pmid = xmlContent['Series/Pubmed-ID']
            paper = pubmed.getOrCreatePaper(pmid)
        except Exception:
            # Best effort: when the paper lookup fails, both values are
            # reported as missing.
            pmid = None
            paper = None

    if 'name' in parse_fields:
        print('+++ title')
        name = xmlContent['Series/Title']

    if 'description' in parse_fields:
        # The XML content is returned as str(xmlContent) in the row; the
        # JSON copy that used to be built here was never used.
        print('+++ add description')

    # Free-text series fields that the cell/tissue/disease parsers search.
    description_dict = {}
    for field in ['Series/Title', 'Series/Summary', 'Series/Type',
                  'Series/Overall-Design']:
        description_dict[field] = xmlContent[field]

    if 'cell type' in parse_fields:
        print('+++ cell type')

    # The cell-type search result is also consulted by the tissue, disease
    # and cell-pop steps, so it is computed whenever any of them is wanted.
    searchCellType = None
    if any(field in parse_fields
           for field in ('cell type', 'tissue', 'disease', 'cell pop')):
        searchCellType = parseAndsearch(description_dict, [
            'cell type', 'cell lineage', 'cell', 'cell line', 'source name',
            'cell description', 'title'
        ])

    if 'cell type' in parse_fields:
        cell_type_hit = searchCellType['cellType']
        # Keep the cell type only when the same term was not also matched
        # as a cell line or a tissue.
        if cell_type_hit and str(cell_type_hit).upper() not in [
                str(searchCellType['cellLine']).upper(),
                str(searchCellType['tissueType']).upper()]:
            tmp_celltype = cell_type_hit

    if 'tissue' in parse_fields:
        print('+++ tissue')
        searchTisssue = parseAndsearch(description_dict, [
            'tissue', 'tissue type', 'tissue depot', 'source name',
            'cell description', 'title', 'cell type', 'cell lineage', 'cell',
            'cell line'
        ])
        # NOTE(review): the condition inspects the cell-type search result
        # while the stored value comes from the tissue search; this is kept
        # exactly as it was -- confirm that it is intended.
        tissue_hit = searchCellType['tissueType']
        if tissue_hit and str(tissue_hit).upper() not in [
                str(searchCellType['cellLine']).upper(),
                str(searchCellType['cellType']).upper(),
                str(searchCellType['cellpop']).upper(),
                str(searchCellType['disease']).upper()]:
            tmp_tissue = searchTisssue['tissueType']
        # The former else-branch re-checked tmp_tissue against the cell
        # type table, but tmp_tissue is always None on that path, so it
        # could never change anything and has been dropped.

    if 'disease' in parse_fields:
        print('+++ disease')
        disease_state = parseDisease(description_dict)
        if searchCellType['disease']:
            disease_state = searchCellType['disease']

    if 'cell pop' in parse_fields:
        print('+++ cell pop')
        cell_pop = parseCellPop(description_dict)
        if searchCellType['cellpop']:
            cell_pop = searchCellType['cellpop']

    if 'release date' in parse_fields:
        geo_release_date = xmlContent['Series/Status/Release-Date']
        geo_last_update_date = xmlContent['Series/Status/Last-Update-Date']

    # The sample accession entry is left out of the stringified XML column.
    xmlContent.pop('Sample/Accession', None)

    res = [
        gseid,
        str(species),
        str(pmid),
        str(paper),
        str(name),
        str(tmp_celltype),
        str(tmp_tissue),
        str(disease_state),
        str(cell_pop),
        str(geo_release_date),
        str(geo_last_update_date),
        str(accession),
        str(xmlContent),
    ]
    # Pause before returning; presumably throttles successive lookups when
    # this is called in a loop -- confirm against the caller.
    time.sleep(3)
    return res
def update_one_sample(gsmid, parse_fields=['other_ids', 'paper', 'name', 'species', 'description', 'antibody', 'factor', 'cell type', 'cell line', 'cell pop', 'tissue', 'strain', 'disease', 'update date', 'release date']):
    """Create or refresh the ``Samples`` row for one GEO sample and save it.

    The row is fetched with ``get_or_create(unique_id=gsmid)``, so an
    existing sample with the same id is updated in place. Only the fields
    named in ``parse_fields`` are overwritten.

    Args:
        gsmid: GEO sample accession, stored as the row's ``unique_id``.
        parse_fields: names of the metadata fields to (re)parse. The list
            is only read, never modified.

    Returns:
        The saved ``models.Samples`` instance, or ``None`` when
        ``postProcessGeo`` returns nothing for the sample (in which case no
        row is created or touched).
    """
    sraId = gsmToSra(gsmid)
    if sraId:
        # The returned XML is not used here. The call is kept in case
        # getSraXML has side effects such as caching the download --
        # TODO confirm, and drop the call if it has none.
        sra.getSraXML(sraId)

    geoPost = postProcessGeo(gsmid)
    if not geoPost:
        return None

    s, _ = models.Samples.objects.get_or_create(unique_id=gsmid)
    assert isinstance(s, models.Samples)

    if 'species' in parse_fields:
        # The literal had been garbled to "H**O SAPIENS", which no organism
        # value can equal, so every sample fell through to pk=2.
        # Assumes getFromPost returns the organism upper-cased -- confirm.
        # The lookup used to be repeated further down with the same result;
        # it now runs once.
        if getFromPost(geoPost, "organism") == "HOMO SAPIENS":
            s.species = models.Species.objects.get(pk=1)
        else:
            s.species = models.Species.objects.get(pk=2)

    if ('other_ids' in parse_fields) or ('paper' in parse_fields):
        gseId = gsmToGse(gsmid)
        pmid = gseToPubmed(gseId) if gseId else None

        if 'other_ids' in parse_fields:
            idList = {'sra': sraId, 'gse': gseId, 'pmid': pmid}
            s.other_ids = json.dumps(idList)

        if 'paper' in parse_fields and pmid:
            s.paper = pubmed.getOrCreatePaper(pmid)

    if 'name' in parse_fields:
        s.name = getFromPost(geoPost, "title")

    # HERE is where I need to create a classifier app/module
    # FACTOR, platform, species--HERE are the rest of them!
    description_dict = parseGeoInfo(gsmid)
    if 'description' in parse_fields:
        s.description = json.dumps(description_dict)
        # Parenthesised form prints the same text on Python 2 and is valid
        # syntax on Python 3.
        print(s.description)

    if 'antibody' in parse_fields:
        s.antibody = parseAntibody(description_dict)

    if 'factor' in parse_fields:
        s.factor = parseFactor(description_dict)

    if 'cell type' in parse_fields:
        s.cell_type = parseCellType(description_dict)

    if 'tissue' in parse_fields:
        s.tissue_type = parseTissue(description_dict)

    if 'cell line' in parse_fields:
        s.cell_line = parseCellLine(description_dict)

    if 'strain' in parse_fields:
        s.strain = parseStrain(description_dict)

    if 'disease' in parse_fields:
        s.disease_state = parseDisease(description_dict)

    if 'cell pop' in parse_fields:
        s.cell_pop = parseCellPop(description_dict)

    if 'update date' in parse_fields:
        s.geo_last_update_date = parseUpdateTime(description_dict)

    if 'release date' in parse_fields:
        s.geo_release_date = parseReleaseTime(description_dict)

    s.save()
    return s