def update_one_sample(gsmid, ddir='geo',
                      parse_fields=('other_ids', 'paper', 'name', 'species',
                                    'description', 'antibody', 'factor',
                                    'cell type', 'cell line', 'cell pop',
                                    'tissue', 'strain', 'disease',
                                    'update date', 'release date')):
    """Collect the metadata of one GEO sample and return it as a flat list.

    Nothing is written to the sample table by this function itself; the
    parsed values are only returned to the caller.

    Args:
        gsmid: sample accession. Its first seven characters name the
            sub-directory of ``ddir`` that holds ``<gsmid>.xml``.
        ddir: directory handed to the GEO helpers and used to build the path
            of the sample XML file.
        parse_fields: names of the fields to parse. A field that is not
            requested, or that could not be parsed, is reported as the
            string ``'None'``. The parsed description dictionary and the XML
            content are always part of the result.

    Returns:
        ``None`` when ``postProcessGeo`` yields nothing for ``gsmid``.
        Otherwise the list ``[gsmid, species, series_id, pmid, paper, name,
        cell type, tissue, disease, cell pop, release date, description,
        xml content]`` in which every element after ``gsmid`` has been passed
        through ``str()``.
    """

    def _upper(value):
        # Case-insensitive comparison key; None becomes the string 'NONE'.
        return str(value).upper()

    print('+++ description')
    description_dict = parseGeoInfo(gsmid, ddir)

    geoPost = postProcessGeo(gsmid, ddir=ddir)
    if not geoPost:
        return None

    # Bind every reported value up front so that a reduced ``parse_fields``
    # produces 'None' entries instead of a NameError when the result is built.
    species = series_id = gseId = pmid = paper = name = None
    tmp_celltype = tmp_tissue = disease_state = cell_pop = None
    geo_release_date = None

    if 'species' in parse_fields:
        print('+++ species')
        # The literal used to read "H**O SAPIENS" (a censoring artifact),
        # which no organism name can equal.
        # NOTE(review): pk=1 is presumably human and pk=2 the fallback
        # species -- confirm against the Species table.
        if getFromPost(geoPost, "organism") == "HOMO SAPIENS":
            species = models.Species.objects.get(pk=1)
        else:
            species = models.Species.objects.get(pk=2)

    if 'other_ids' in parse_fields or 'paper' in parse_fields:
        print('+++ other IDs')
        gseId = gsmToGse(gsmid)
        if not gseId:
            # The GSE lookup sometimes fails on the first attempt; retry once
            # after a short pause.
            time.sleep(0.3)
            print('+++ again gse')
            gseId = gsmToGse(gsmid)
        pmid = gseToPubmed(gseId) if gseId else None

    if 'other_ids' in parse_fields:
        print({'gse': gseId, 'pmid': pmid})
        if gseId:
            try:
                series_id = gse_idToAcc(gseId)
            except Exception:
                print('cannot find GSE_id')

    if 'paper' in parse_fields and pmid:
        print('+++ paper')
        try:
            paper = pubmed.getOrCreatePaper(pmid)
        except Exception:
            # Best effort: a failed paper lookup must not abort the sample.
            paper = None

    if 'name' in parse_fields:
        print('+++ title')
        name = getFromPost(geoPost, "title")

    # The tissue, disease and cell pop sections all read the cell-type search
    # result, so run that search whenever any of them is requested.
    searchCellType = None
    if any(field in parse_fields
           for field in ('cell type', 'tissue', 'disease', 'cell pop')):
        searchCellType = parseAndsearch(
            description_dict,
            ['cell type', 'cell lineage', 'cell', 'cell line', 'source name',
             'cell description', 'title'])

    if 'cell type' in parse_fields:
        print('+++ cell type')
        found_celltype = searchCellType['cellType']
        # Keep the cell type only when it is not the same term that was also
        # recognised as the cell line or the tissue.
        if found_celltype and _upper(found_celltype) not in [
                _upper(searchCellType['cellLine']),
                _upper(searchCellType['tissueType'])]:
            tmp_celltype = found_celltype

    if 'tissue' in parse_fields:
        print('+++ tissue')
        searchTisssue = parseAndsearch(
            description_dict,
            ['tissue', 'tissue type', 'tissue depot', 'source name',
             'cell description', 'title', 'cell type', 'cell lineage', 'cell',
             'cell line'])
        # NOTE(review): the condition inspects the cell-type search result
        # while the value comes from the tissue search; kept as in the
        # original -- confirm this asymmetry is intended.
        tissue_candidate = searchCellType['tissueType']
        competing_terms = [_upper(searchCellType[key])
                           for key in ('cellLine', 'cellType', 'cellpop',
                                       'disease')]
        if tissue_candidate and _upper(tissue_candidate) not in competing_terms:
            tmp_tissue = searchTisssue['tissueType']

    if 'disease' in parse_fields:
        print('+++ disease')
        disease_state = parseDisease(description_dict)
        # A disease found by the cell-type search takes precedence.
        if searchCellType['disease']:
            disease_state = searchCellType['disease']

    if 'cell pop' in parse_fields:
        print('+++ cell pop')
        cell_pop = parseCellPop(description_dict)
        # A cell population found by the cell-type search takes precedence.
        if searchCellType['cellpop']:
            cell_pop = searchCellType['cellpop']

    if 'release date' in parse_fields:
        geo_release_date = parseReleaseTime(description_dict)

    # Fields for checking the sample type.
    sample_path = os.path.join(ddir, gsmid[:7], gsmid + '.xml')
    # Plain print instead of shelling out to ``echo``: same progress output
    # without passing caller-supplied text through a shell.
    print(sample_path)
    xmlContent = scrna_parser._getFieldXML(sample_path)

    return [gsmid, str(species), str(series_id), str(pmid), str(paper),
            str(name), str(tmp_celltype), str(tmp_tissue), str(disease_state),
            str(cell_pop), str(geo_release_date), str(description_dict),
            str(xmlContent)]
Exemple #2
0
def update_one_sample(gseid,
                      ddir='geo',
                      parse_fields=(
                          'other_ids', 'paper', 'name', 'species',
                          'description', 'antibody', 'factor', 'cell type',
                          'cell line', 'cell pop', 'tissue', 'strain',
                          'disease', 'update date', 'release date'
                      )):
    """Collect the metadata of one GEO series and return it as a flat list.

    The series XML is read from ``<ddir>/<gseid[:7]>/<gseid>.xml``. Nothing
    is written to the database by this function itself, apart from whatever
    ``pubmed.getOrCreatePaper`` does.

    Args:
        gseid: series accession used to locate the XML file and the HTML
            page that species information is parsed from.
        ddir: directory that holds the downloaded XML files.
        parse_fields: names of the fields to parse. A field that is not
            requested, or that could not be parsed, is reported as the
            string ``'None'``.

    Returns:
        The list ``[gseid, species, pmid, paper, name, cell type, tissue,
        disease, cell pop, release date, last update date, accession,
        xml content]`` in which every element after ``gseid`` has been passed
        through ``str()``. ``'Sample/Accession'`` is removed from the XML
        content before it is stringified.

    Raises:
        KeyError: when the XML content lacks one of the series fields that
            are read unconditionally (title, summary, type, overall design)
            or a requested date field.
    """

    def _upper(value):
        # Case-insensitive comparison key; None becomes the string 'NONE'.
        return str(value).upper()

    print('+++ description')
    fields = [
        'Series/Title', 'Series/Summary', 'Series/Type',
        'Series/Overall-Design', 'Series/Status/Release-Date',
        'Series/Status/Last-Update-Date', 'Sample/Accession',
        'Series/Pubmed-ID'
    ]
    sample_path = os.path.join(ddir + '/' + gseid[:7] + '/' + gseid + '.xml')
    xmlContent = getGEOSamples_byType_gse._getFieldXML(sample_path,
                                                       fields=fields)

    # Bind every reported value up front so that a reduced ``parse_fields``
    # produces 'None' entries instead of a NameError when the result is built.
    species = accession = pmid = paper = name = None
    tmp_celltype = tmp_tissue = disease_state = cell_pop = None
    geo_release_date = geo_last_update_date = None

    if 'species' in parse_fields:
        print('+++ species')
        urlcont = _parse_from_html(gseid)
        if urlcont:
            species, accession = _parse_species_gsm(urlcont)
            # Normalise any falsy parse result to None.
            species = species or None
            accession = accession or None

    if 'paper' in parse_fields and 'Series/Pubmed-ID' in xmlContent:
        print('+++ paper')
        try:
            pmid = xmlContent['Series/Pubmed-ID']
            paper = pubmed.getOrCreatePaper(pmid)
        except Exception:
            # Best effort: a failed paper lookup drops both values rather
            # than aborting the whole series.
            pmid = None
            paper = None

    if 'name' in parse_fields:
        print('+++ title')
        name = xmlContent['Series/Title']

    # Free-text series fields that the cell/tissue/disease parsers search in.
    description_dict = {
        field: xmlContent[field]
        for field in ('Series/Title', 'Series/Summary', 'Series/Type',
                      'Series/Overall-Design')
    }

    # The tissue, disease and cell pop sections all read the cell-type search
    # result, so run that search whenever any of them is requested.
    searchCellType = None
    if any(field in parse_fields
           for field in ('cell type', 'tissue', 'disease', 'cell pop')):
        searchCellType = parseAndsearch(description_dict, [
            'cell type', 'cell lineage', 'cell', 'cell line', 'source name',
            'cell description', 'title'
        ])

    if 'cell type' in parse_fields:
        print('+++ cell type')
        found_celltype = searchCellType['cellType']
        # Keep the cell type only when it is not the same term that was also
        # recognised as the cell line or the tissue.
        if found_celltype and _upper(found_celltype) not in [
                _upper(searchCellType['cellLine']),
                _upper(searchCellType['tissueType'])
        ]:
            tmp_celltype = found_celltype

    if 'tissue' in parse_fields:
        print('+++ tissue')
        searchTisssue = parseAndsearch(description_dict, [
            'tissue', 'tissue type', 'tissue depot', 'source name',
            'cell description', 'title', 'cell type', 'cell lineage', 'cell',
            'cell line'
        ])
        # NOTE(review): the condition inspects the cell-type search result
        # while the value comes from the tissue search; kept as in the
        # original -- confirm this asymmetry is intended.
        tissue_candidate = searchCellType['tissueType']
        competing_terms = [
            _upper(searchCellType[key])
            for key in ('cellLine', 'cellType', 'cellpop', 'disease')
        ]
        if tissue_candidate and _upper(tissue_candidate) not in competing_terms:
            tmp_tissue = searchTisssue['tissueType']

    if 'disease' in parse_fields:
        print('+++ disease')
        disease_state = parseDisease(description_dict)
        # A disease found by the cell-type search takes precedence.
        if searchCellType['disease']:
            disease_state = searchCellType['disease']

    if 'cell pop' in parse_fields:
        print('+++ cell pop')
        cell_pop = parseCellPop(description_dict)
        # A cell population found by the cell-type search takes precedence.
        if searchCellType['cellpop']:
            cell_pop = searchCellType['cellpop']

    if 'release date' in parse_fields:
        # Both dates are gated on the 'release date' field.
        geo_release_date = xmlContent['Series/Status/Release-Date']
        geo_last_update_date = xmlContent['Series/Status/Last-Update-Date']

    # The sample accession list is left out of the stringified XML content.
    xmlContent.pop('Sample/Accession', None)

    res = [
        gseid,
        str(species),
        str(pmid),
        str(paper),
        str(name),
        str(tmp_celltype),
        str(tmp_tissue),
        str(disease_state),
        str(cell_pop),
        str(geo_release_date),
        str(geo_last_update_date),
        str(accession),
        str(xmlContent)
    ]
    # NOTE(review): fixed pause after every series, presumably to throttle
    # requests to the remote site -- confirm before changing.
    time.sleep(3)
    return res
Exemple #3
0
def update_one_sample(gsmid,
                      parse_fields=('other_ids', 'paper', 'name', 'species',
                                    'description', 'antibody', 'factor',
                                    'cell type', 'cell line', 'cell pop',
                                    'tissue', 'strain', 'disease',
                                    'update date', 'release date')):
    """Create or refresh the sample for ``gsmid`` and fill in its meta fields.

    The sample is looked up with ``get_or_create`` on ``unique_id``, so an
    existing sample with the same ``gsmid`` is updated in place. Only the
    fields named in ``parse_fields`` are touched. The sample is saved before
    it is returned.

    Args:
        gsmid: sample accession, used as the sample's ``unique_id``.
        parse_fields: names of the fields to (re)parse.

    Returns:
        The saved ``models.Samples`` instance, or ``None`` when
        ``postProcessGeo`` yields nothing for ``gsmid`` (in which case no
        sample is created).
    """

    sraId = gsmToSra(gsmid)
    # NOTE(review): the XML itself is unused below; the call is kept in case
    # the fetch has side effects such as caching -- confirm and drop it if not.
    sraXML = sra.getSraXML(sraId) if sraId else None

    geoPost = postProcessGeo(gsmid)
    if not geoPost:
        return None

    s, created = models.Samples.objects.get_or_create(unique_id=gsmid)
    assert isinstance(s, models.Samples)

    if 'species' in parse_fields:
        # The literal used to read "H**O SAPIENS" (a censoring artifact),
        # which no organism name can equal.
        # NOTE(review): pk=1 is presumably human and pk=2 the fallback
        # species -- confirm against the Species table.
        if getFromPost(geoPost, "organism") == "HOMO SAPIENS":
            s.species = models.Species.objects.get(pk=1)
        else:
            s.species = models.Species.objects.get(pk=2)

    if ('other_ids' in parse_fields) or ('paper' in parse_fields):
        gseId = gsmToGse(gsmid)
        pmid = gseToPubmed(gseId) if gseId else None

    if 'other_ids' in parse_fields:
        idList = {'sra': sraId, 'gse': gseId, 'pmid': pmid}
        s.other_ids = json.dumps(idList)

    # ``pmid`` is only bound when 'paper' or 'other_ids' was requested; the
    # short-circuit on 'paper' keeps this test safe.
    if 'paper' in parse_fields and pmid:
        s.paper = pubmed.getOrCreatePaper(pmid)

    if 'name' in parse_fields:
        s.name = getFromPost(geoPost, "title")

    # HERE is where I need to create a classifier app/module
    # FACTOR, platform, species--HERE are the rest of them!

    description_dict = parseGeoInfo(gsmid)
    if 'description' in parse_fields:
        s.description = json.dumps(description_dict)
        # Single-argument call form prints the same text on Python 2 and 3.
        print(s.description)

    if 'antibody' in parse_fields:
        s.antibody = parseAntibody(description_dict)

    if 'factor' in parse_fields:
        s.factor = parseFactor(description_dict)

    if 'cell type' in parse_fields:
        s.cell_type = parseCellType(description_dict)

    if 'tissue' in parse_fields:
        s.tissue_type = parseTissue(description_dict)

    if 'cell line' in parse_fields:
        s.cell_line = parseCellLine(description_dict)

    if 'strain' in parse_fields:
        s.strain = parseStrain(description_dict)

    if 'disease' in parse_fields:
        s.disease_state = parseDisease(description_dict)

    if 'cell pop' in parse_fields:
        s.cell_pop = parseCellPop(description_dict)

    if 'update date' in parse_fields:
        s.geo_last_update_date = parseUpdateTime(description_dict)

    if 'release date' in parse_fields:
        s.geo_release_date = parseReleaseTime(description_dict)

    s.save()
    return s