Example #1
def collectMeta(a):
    """Build the per-article metadata row: article links, bibliographic fields
    and curated metadata values. Relies on the module-level nom_vars, cont_vars,
    num_nom_vars and the pubmed/article base link string templates."""
    amdms = m.ArticleMetaDataMap.objects.filter(article = a)
    curr_metadata_list = ['']*(len(nom_vars) + len(cont_vars))
    
    # Process metadata for nominal vars
    for i,v in enumerate(nom_vars):
        valid_vars = amdms.filter(metadata__name = v)
        temp_metadata_list = [vv.metadata.value for vv in valid_vars]
        if 'in vitro' in temp_metadata_list and 'cell culture' in temp_metadata_list:
            curr_metadata_list[i] = 'cell culture'
        elif v == 'Strain' and amdms.filter(metadata__value = 'Mice').count() > 0:
            temp_metadata_list = 'C57BL'
            curr_metadata_list[i] = 'C57BL'
        elif v == 'Strain' and amdms.filter(metadata__value = 'Guinea Pigs').count() > 0:
            temp_metadata_list = 'Guinea Pigs'
            curr_metadata_list[i] = 'Guinea Pigs'
        elif len(temp_metadata_list) == 0 and v == 'Strain':
            if amdms.filter(metadata__value = 'Rats').count() > 0:
                if np.random.randn(1)[0] > 0:
                    curr_metadata_list[i] = 'Sprague-Dawley'
                else:
                    curr_metadata_list[i] = 'Wistar'
        elif len(temp_metadata_list) > 1: 
            temp_metadata_list = temp_metadata_list[0]
            curr_metadata_list[i] = temp_metadata_list
        else:
            curr_metadata_list[i] = u'; '.join(temp_metadata_list)
            
    # Process metadata for continuous vars
    for i,v in enumerate(cont_vars):
        valid_vars = amdms.filter(metadata__name = v)
        if valid_vars.count() > 0:
            cont_value_ob = valid_vars[0].metadata.cont_value.mean
            curr_metadata_list[i+num_nom_vars] = cont_value_ob
        else:
            # no stored value; if the prep was in vivo, assume body temperature for RecTemp
            if v == 'RecTemp' and amdms.filter(metadata__value = 'in vivo').count() > 0:
                curr_metadata_list[i+num_nom_vars] = 37.0
       
    pubmed_link_str = pubmed_base_link_str % a.pmid
    article_link_str = article_base_link_str % a.pk
    
    last_author = get_article_last_author(a)
    if last_author is not None:
        last_author_name = '%s %s' % (last_author.last, last_author.initials)
        last_author_name = last_author_name.encode("utf8", "replace")
    else:
        last_author_name = ''
            
    curr_meta_list = []
    
    curr_meta_list.append(pubmed_link_str)
    curr_meta_list.append((a.title).encode("utf8", "replace"))
    curr_meta_list.append(a.journal)
    curr_meta_list.append(a.pub_year)
    curr_meta_list.append(article_link_str)
    curr_meta_list.append(last_author_name)
    curr_meta_list.extend(curr_metadata_list)

    return curr_meta_list
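
A minimal usage sketch for collectMeta (hypothetical, not part of the original module): it assumes a Django shell where the model alias m, collectMeta and the module-level variable lists are already in scope, and writes one tab-separated row per article.

# Hypothetical usage sketch for collectMeta(); assumes `m` and the module-level
# nom_vars / cont_vars globals are already defined, as in the function above.
import csv

def export_article_meta_rows(path='article_meta_rows.tsv'):
    articles = m.Article.objects.all()
    with open(path, 'wb') as f:  # binary mode for the Python 2 csv module
        writer = csv.writer(f, delimiter='\t')
        for a in articles:
            writer.writerow(collectMeta(a))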
Example #2
def assign_ephys_grandfather(article):
    """
    Assign 1 of N ephys grandfathers to a NeuroElectro article object
    by searching NeuroTree
    """
    result = None

    grandfather_list = define_ephys_grandfathers()
    last_author_ob = get_article_last_author(article)
    if last_author_ob is not None:
        a_node = get_neurotree_author(last_author_ob)
        if a_node is not None:
            closest_grandfather = get_closest_grandfather(a_node, grandfather_list)
            result = closest_grandfather
    return result
Example #3
def assign_articles_grandfathers():
    """
    Assign ephys grandfathers to each article containing 
    ephys data in NeuroElectro.
    """
    q1 = Q(datatable__datasource__neuronconceptmap__times_validated__gte = 1)
    q2 = Q(usersubmission__datasource__neuronconceptmap__times_validated__gte = 1)
    articles = m.Article.objects.filter(q1 | q2).distinct()
    article_info_list = []
    for article in articles:
        grandfather = assign_ephys_grandfather(article)
        author = get_article_last_author(article)
        if author is not None:
            neurotree_node = get_neurotree_author(author)
        else:
            neurotree_node = None
        article_info = [author, neurotree_node, grandfather]
        article_info_list.append(article_info)
    return article_info_list
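
One way to summarize the returned list is sketched below (an illustrative example, not code from the project): tally how many articles each grandfather receives, using the lastname attribute that export code elsewhere in this module reads from grandfather nodes.

# Sketch: tally grandfather assignments from assign_articles_grandfathers().
# Assumes grandfather nodes expose a `lastname` attribute, as the commented-out
# export code elsewhere in this module suggests.
from collections import Counter

def summarize_grandfathers(article_info_list):
    counts = Counter()
    for author, neurotree_node, grandfather in article_info_list:
        key = grandfather.lastname if grandfather is not None else 'unassigned'
        counts[key] += 1
    return counts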
def getAllArticleNedmMetadataSummary(getAllMetadata = False):
    """The old function for exporting the DB to a csv file, added here for reference"""
    # TODO: uncomment and remove unnecessary metadata
    #     articles = m.Article.objects.filter(Q(datatable__datasource__neuronconceptmap__times_validated__gte = 1) |
    #         Q(usersubmission__datasource__neuronconceptmap__times_validated__gte = 1)).distinct()
    #     articles = articles.filter(articlefulltext__articlefulltextstat__metadata_human_assigned = True ).distinct()
    articles = m.Article.objects.all()

    nom_vars = ['Species', 'Strain', 'ElectrodeType', 'PrepType', 'JxnPotential']
    cont_vars  = ['JxnOffset', 'RecTemp', 'AnimalAge', 'AnimalWeight', 'FlagSoln']
    cont_var_headers = ['JxnOffset', 'Temp', 'Age', 'Weight', 'FlagSoln']
    if getAllMetadata:
        for i in range(0, 5):
            cont_vars.extend(['external_%s_Mg' % i, 'external_%s_Ca' % i, 'external_%s_Na' % i, 'external_%s_Cl' % i, 'external_%s_K' % i, 'external_%s_pH' % i, 'external_%s_text' % i, 'internal_%s_Mg' % i, 'internal_%s_Ca' % i, 'internal_%s_Na' % i, 'internal_%s_Cl' % i, 'internal_%s_K' % i, 'internal_%s_pH' % i, 'internal_%s_text' % i])
            cont_var_headers.extend(['External_%s_Mg' % i, 'External_%s_Ca' % i, 'External_%s_Na' % i, 'External_%s_Cl' % i, 'External_%s_K' % i, 'External_%s_pH' % i, 'External_%s_text' % i, 'Internal_%s_Mg' % i, 'Internal_%s_Ca' % i, 'Internal_%s_Na' % i, 'Internal_%s_Cl' % i, 'Internal_%s_K' % i, 'Internal_%s_pH' % i, 'Internal_%s_text' % i])
    num_nom_vars = len(nom_vars)
    ephys_use_pks = range(1,28)

    ephys_list = m.EphysProp.objects.filter(pk__in = ephys_use_pks)
    ephys_headers = []
    for e in ephys_list:
        ephys_name_str = re.sub(r"[\s-]", "", e.name.title())
        ephys_headers.append(ephys_name_str)

    csvout = csv.writer(open(settings.OUTPUT_FILES_DIRECTORY + "article_ephys_metadata_curated.csv", "w+b"), delimiter = '\t')

    other_headers = ['NeuronType', 'Title', 'Journal', 'PubYear', 'PubmedLink', 'DataTableLinks', 'ArticleDataLink', 'LastAuthor']
    all_headers = other_headers
    all_headers.extend(ephys_headers)
    all_headers.extend(nom_vars + cont_var_headers)

    pubmed_base_link_str = 'http://www.ncbi.nlm.nih.gov/pubmed/%d/'
    table_base_link_str = 'http://neuroelectro.org/data_table/%d/'
    article_base_link_str = 'http://neuroelectro.org/article/%d/'

    csvout.writerow(all_headers)
    for a in articles:
        print "processing metadata for article: %s" % a.pk
        amdms = m.ArticleMetaDataMap.objects.filter(article = a)
        curr_metadata_list = ['']*(len(nom_vars) + len(cont_vars))
        for i,v in enumerate(nom_vars):
            valid_vars = amdms.filter(metadata__name = v)
            temp_metadata_list = [vv.metadata.value for vv in valid_vars]
            if 'in vitro' in temp_metadata_list and 'cell culture' in temp_metadata_list:
                curr_metadata_list[i] = 'cell culture'
            elif v == 'Strain' and amdms.filter(metadata__value = 'Mice').count() > 0:
                temp_metadata_list = 'C57BL'
                curr_metadata_list[i] = 'C57BL'
            elif v == 'Strain' and amdms.filter(metadata__value = 'Guinea Pigs').count() > 0:
                temp_metadata_list = 'Guinea Pigs'
                curr_metadata_list[i] = 'Guinea Pigs'
            elif len(temp_metadata_list) == 0 and v == 'Strain':
                if amdms.filter(metadata__value = 'Rats').count() > 0:
                    if np.random.randn(1)[0] > 0:
                        curr_metadata_list[i] = 'Sprague-Dawley'
                    else:
                        curr_metadata_list[i] = 'Wistar'
            elif len(temp_metadata_list) > 1:
                temp_metadata_list = temp_metadata_list[0]
                curr_metadata_list[i] = temp_metadata_list
            else:
                curr_metadata_list[i] = u'; '.join(temp_metadata_list)
        for i,v in enumerate(cont_vars):
            valid_vars = amdms.filter(metadata__name = v)
            if valid_vars.count() > 0:
                cont_value_ob = valid_vars[0].metadata.cont_value.mean
                curr_metadata_list[i+num_nom_vars] = cont_value_ob
            else:
                # no stored value; use a sensible default or the solution reference text where possible
                if v == 'RecTemp' and amdms.filter(metadata__value = 'in vivo').count() > 0:
                    curr_metadata_list[i+num_nom_vars] = 37.0
                elif 'text' in v and ('external' in v or 'internal' in v):
                    for j in range(i - 6, i - 1, 1):
                        conc_amdm = amdms.filter(metadata__name = cont_vars[j])
                        if len(conc_amdm) > 0:
                            curr_metadata_list[i+num_nom_vars] = conc_amdm[0].metadata.ref_text.text.encode('utf8', "replace")
                            break
                        else:
                            curr_metadata_list[i+num_nom_vars] = 'NaN'
                else:
                    curr_metadata_list[i+num_nom_vars] = 'NaN'

# TODO: uncomment these 2 lines
        neurons = m.Neuron.objects.filter(Q(neuronconceptmap__times_validated__gte = 1) &
            ( Q(neuronconceptmap__source__data_table__article = a) | Q(neuronconceptmap__source__user_submission__article = a))).distinct()
        neurons = m.Neuron.objects.filter( Q(neuronconceptmap__source__data_table__article = a) | Q(neuronconceptmap__source__user_submission__article = a)).distinct()

        pubmed_link_str = pubmed_base_link_str % a.pmid
        article_link_str = article_base_link_str % a.pk
        dts = m.DataTable.objects.filter(article = a, datasource__neuronconceptmap__times_validated__gte = 1).distinct()
        if dts.count() > 0:
            dt_link_list = [table_base_link_str % dt.pk for dt in dts]
            dt_link_str = u'; '.join(dt_link_list)
        else:
            dt_link_str = ''

        #grandfather = define_ephys_grandfather(a)
        # grandfather = None
        # if grandfather is not None:
        #     grandfather_name = grandfather.lastname
        #     grandfather_name = grandfather_name.encode("iso-8859-15", "replace")
        # else:
        #     grandfather_name = ''
        last_author = get_article_last_author(a)
        if last_author is not None:
            last_author_name = '%s %s' % (last_author.last, last_author.initials)
            last_author_name = last_author_name.encode("utf8", "replace")
            # if grandfather_name is '':
            #     neuro_tree_node = get_neurotree_author(last_author)
            #     if neuro_tree_node is None:
            #         grandfather_name = 'Node not found'
        else:
            last_author_name = ''

        for n in neurons:
            curr_ephys_prop_list = []

            curr_ephys_prop_list.append(n.name)
            curr_ephys_prop_list.append((a.title).encode("utf8", "replace"))
            curr_ephys_prop_list.append(a.journal)
            curr_ephys_prop_list.append(a.pub_year)
            curr_ephys_prop_list.append(pubmed_link_str)
            curr_ephys_prop_list.append(dt_link_str)
            curr_ephys_prop_list.append(article_link_str)
            curr_ephys_prop_list.append(last_author_name)

            for e in ephys_list:
                curr_ephys_prop_list.append(computeArticleNedmSummary(a.pmid, n, e))

            curr_ephys_prop_list.extend(curr_metadata_list)
            #curr_ephys_prop_list.append(grandfather_name)

            csvout.writerow(curr_ephys_prop_list)
    return articles
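
For quick spot checks, the tab-separated file written by this legacy exporter can be reloaded with pandas; a small sketch (assuming the same settings.OUTPUT_FILES_DIRECTORY and that pandas is importable as pd):

# Sketch: reload the exported TSV for inspection (assumes pandas and the same
# settings.OUTPUT_FILES_DIRECTORY used by the exporter above).
import pandas as pd

def load_curated_export():
    path = settings.OUTPUT_FILES_DIRECTORY + "article_ephys_metadata_curated.csv"
    return pd.read_csv(path, sep='\t')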
def export_db_to_data_frame():
    """Returns a nicely formatted pandas data frame of the ephys data and metadata for each stored article"""

    ncms = m.NeuronConceptMap.objects.all()#.order_by('-history__latest__history_date') # gets human-validated neuron mappings
    ncms = ncms.exclude(Q(source__data_table__irrelevant_flag = True) | Q(source__data_table__needs_expert = True)) # exclude
    ncm_count = ncms.count()
    ephys_props = m.EphysProp.objects.all().order_by('-ephyspropsummary__num_neurons')
    ephys_names = []
    for e in ephys_props:
        ephys_names.append(e.short_name)
        ephys_names.append(e.short_name + '_err')
        ephys_names.append(e.short_name + '_n')
        ephys_names.append(e.short_name + '_sd')
        ephys_names.append(e.short_name + '_note')
    #ephys_names = [e.name for e in ephys_props]
    #ncms = ncms.sort('-changed_on')
    dict_list = []
    for kk, ncm in enumerate(ncms):
        prog(kk, ncm_count)

        # TODO: need to check whether nedms under the same ncm have different experimental factor concept maps
        #     # check if any nedms have any experimental factors assoc with them
        #     efcms = ne_db.ExpFactConceptMap.objects.filter(neuronephysdatamap__in = nedms)
        #     for efcm in efcms:
        #         nedms = ne_db.NeuronEphysDataMap.objects.filter(neuron_concept_map = ncm, exp_fact_concept_map = ).distinct()

        nedms = m.NeuronEphysDataMap.objects.filter(neuron_concept_map = ncm, expert_validated = True).distinct()
        if nedms.count() == 0:
            continue

        sd_errors = identify_stdev(nedms)

        temp_dict = dict()
        temp_metadata_list = []
        for nedm in nedms:
            e = nedm.ephys_concept_map.ephys_prop
            # check data integrity - value MUST be in appropriate range for property
            data_val =  nedm.val_norm
            err_val = nedm.err_norm
            n_val = nedm.n
            note_val = nedm.ephys_concept_map.note
            if check_data_val_range(data_val, e):
                output_ephys_name = e.short_name
                output_ephys_err_name = '%s_err' % output_ephys_name
                output_ephys_sd_name = '%s_sd' % output_ephys_name
                output_ephys_n_name = '%s_n' % output_ephys_name
                output_ephys_note_name = '%s_note' % output_ephys_name
                temp_dict[output_ephys_name] = data_val
                temp_dict[output_ephys_err_name] = err_val
                temp_dict[output_ephys_n_name] = n_val
                temp_dict[output_ephys_note_name] = note_val

                # do converting to standard dev from standard error if needed
                if sd_errors:
                    temp_dict[output_ephys_sd_name] = err_val
                else:
                    # need to calculate sd
                    if err_val and n_val:
                        sd_val = err_val * np.sqrt(n_val)
                        temp_dict[output_ephys_sd_name] = sd_val

            #temp_metadata_list.append(nedm.get_metadata())

        temp_dict['NeuronName'] =  ncm.neuron.name
        temp_dict['NeuronLongName'] =  ncm.neuron_long_name
        if ncm.neuron_long_name:
            temp_dict['NeuronPrefName'] = ncm.neuron_long_name
        else:
            temp_dict['NeuronPrefName'] = ncm.neuron.name
        article = ncm.get_article()

        brain_reg_dict = get_neuron_region(ncm.neuron)
        if brain_reg_dict:
            temp_dict['BrainRegion'] = brain_reg_dict['region_name']

        #article_metadata = normalize_metadata(article)

        metadata_list = nedm.get_metadata()
        out_dict = dict()
        for metadata in metadata_list:
            #print metadata.name
            if not metadata.cont_value:
                if metadata.name in out_dict:
                    out_dict[metadata.name] = '%s, %s' % (out_dict[metadata.name], metadata.value)
                else:
                    out_dict[metadata.name] = metadata.value
            elif metadata.cont_value and 'Solution' in metadata.name:
                article = nedm.get_article()
                amdm = m.ArticleMetaDataMap.objects.filter(article = article, metadata__name = metadata.name)[0]
                ref_text = amdm.ref_text
                out_dict[metadata.name] = ref_text.text.encode('utf8', "replace")
                out_dict[metadata.name + '_conf'] = metadata.cont_value.mean
            elif metadata.cont_value and 'AnimalAge' in metadata.name:
                # return geometric mean of age ranges, not arithmetic mean
                if metadata.cont_value.min_range and metadata.cont_value.max_range:
                    min_range = metadata.cont_value.min_range
                    max_range = metadata.cont_value.max_range
                    if min_range <= 0:
                        min_range = 1
                    geom_mean = np.sqrt(min_range * max_range)
                    out_dict[metadata.name] = geom_mean
                else:
                    out_dict[metadata.name] = metadata.cont_value.mean
            else:
                out_dict[metadata.name] = metadata.cont_value.mean

        # has article metadata been curated by a human?
        afts = article.get_full_text_stat()
        if afts and afts.metadata_human_assigned:
            metadata_curated = True
            metadata_curation_note = afts.metadata_curation_note
        else:
            metadata_curated = False
            metadata_curation_note = None

        if ncm.source.data_table:
            data_table_note = ncm.source.data_table.note
        else:
            data_table_note = None

        temp_dict2 = temp_dict.copy()
        temp_dict2.update(out_dict)
        temp_dict = temp_dict2
        temp_dict['Title'] = article.title
        temp_dict['Pmid'] = article.pmid
        temp_dict['PubYear'] = article.pub_year
        temp_dict['LastAuthor'] = unicode(get_article_last_author(article))
        temp_dict['TableID'] = ncm.source.data_table_id
        temp_dict['TableNote'] = data_table_note
        temp_dict['ArticleID'] = article.pk
        temp_dict['MetadataCurated'] = metadata_curated
        temp_dict['MetadataNote'] = metadata_curation_note
        #print temp_dict
        dict_list.append(temp_dict)

    base_names = ['Title', 'Pmid', 'PubYear', 'LastAuthor', 'ArticleID', 'TableID',
                  'NeuronName', 'NeuronLongName', 'NeuronPrefName', 'BrainRegion']
    nom_vars = ['MetadataCurated', 'Species', 'Strain', 'ElectrodeType', 'PrepType', 'JxnPotential']
    cont_vars  = ['JxnOffset', 'RecTemp', 'AnimalAge', 'AnimalWeight', 'FlagSoln']
    annot_notes = ['MetadataNote', 'TableNote']

    for i in range(0, 1):
        cont_vars.extend([ 'ExternalSolution', 'ExternalSolution_conf', 'external_%s_Mg' % i, 'external_%s_Ca' % i, 'external_%s_Na' % i, 'external_%s_Cl' % i, 'external_%s_K' % i, 'external_%s_pH' % i, 'InternalSolution', 'InternalSolution_conf', 'internal_%s_Mg' % i, 'internal_%s_Ca' % i, 'internal_%s_Na' % i, 'internal_%s_Cl' % i, 'internal_%s_K' % i, 'internal_%s_pH' % i])
        #cont_var_headers.extend(['External_%s_Mg' % i, 'External_%s_Ca' % i, 'External_%s_Na' % i, 'External_%s_Cl' % i, 'External_%s_K' % i, 'External_%s_pH' % i, 'External_%s_text' % i, 'Internal_%s_Mg' % i, 'Internal_%s_Ca' % i, 'Internal_%s_Na' % i, 'Internal_%s_Cl' % i, 'Internal_%s_K' % i, 'Internal_%s_pH' % i, 'Internal_%s_text' % i])

    col_names = base_names + nom_vars + cont_vars + annot_notes + ephys_names

    # set up pandas data frame for export
    df = pd.DataFrame(dict_list, columns = col_names)

    # perform collapsing of rows about same neuron types but potentially across different tables
    cleaned_df = df
    # need to generate a random int for coercing NaN's to something - required for pandas grouping
    rand_int = -abs(np.random.randint(20000))
    cleaned_df.loc[:, 'Pmid':'FlagSoln'] = df.loc[:, 'Pmid':'FlagSoln'].fillna(rand_int)
    grouping_fields = base_names + nom_vars + cont_vars
    grouping_fields.remove('TableID')
    # collapse duplicate rows by averaging numeric values within each group
    cleaned_df = cleaned_df.groupby(by = grouping_fields, as_index=False).mean()
    cleaned_df.replace(to_replace = rand_int, value = np.nan, inplace=True)
    cleaned_df.reset_index(inplace=True)
    cleaned_df.sort_values(by = ['PubYear', 'Pmid', 'NeuronName'], ascending=[False, True, True], inplace=True)
    cleaned_df.index.name = "Index"

    # add in extra ephys data from columns based on known relationships, e.g., AP amp from AP peak and AP thr
    cleaned_df = add_ephys_props_by_conversion(cleaned_df)

    return cleaned_df
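
A hypothetical call site for the exporter above, persisting the returned frame to a TSV (the file name is illustrative):

# Sketch: materialize the cleaned data frame for downstream analysis.
cleaned_df = export_db_to_data_frame()
cleaned_df.to_csv('neuroelectro_ephys_metadata.tsv', sep='\t', encoding='utf-8')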
def getAllArticleNedmMetadataSummary(getAllMetadata=False):
    """The old function for exporting the DB to a csv file, added here for reference"""
    # TODO: uncomment and remove unnecessary metadata
    #     articles = m.Article.objects.filter(Q(datatable__datasource__neuronconceptmap__times_validated__gte = 1) |
    #         Q(usersubmission__datasource__neuronconceptmap__times_validated__gte = 1)).distinct()
    #     articles = articles.filter(articlefulltext__articlefulltextstat__metadata_human_assigned = True ).distinct()
    articles = m.Article.objects.all()

    nom_vars = ["Species", "Strain", "ElectrodeType", "PrepType", "JxnPotential"]
    cont_vars = ["JxnOffset", "RecTemp", "AnimalAge", "AnimalWeight", "FlagSoln"]
    cont_var_headers = ["JxnOffset", "Temp", "Age", "Weight", "FlagSoln"]
    if getAllMetadata:
        for i in range(0, 5):
            cont_vars.extend(
                [
                    "external_%s_Mg" % i,
                    "external_%s_Ca" % i,
                    "external_%s_Na" % i,
                    "external_%s_Cl" % i,
                    "external_%s_K" % i,
                    "external_%s_pH" % i,
                    "external_%s_text" % i,
                    "internal_%s_Mg" % i,
                    "internal_%s_Ca" % i,
                    "internal_%s_Na" % i,
                    "internal_%s_Cl" % i,
                    "internal_%s_K" % i,
                    "internal_%s_pH" % i,
                    "internal_%s_text" % i,
                ]
            )
            cont_var_headers.extend(
                [
                    "External_%s_Mg" % i,
                    "External_%s_Ca" % i,
                    "External_%s_Na" % i,
                    "External_%s_Cl" % i,
                    "External_%s_K" % i,
                    "External_%s_pH" % i,
                    "External_%s_text" % i,
                    "Internal_%s_Mg" % i,
                    "Internal_%s_Ca" % i,
                    "Internal_%s_Na" % i,
                    "Internal_%s_Cl" % i,
                    "Internal_%s_K" % i,
                    "Internal_%s_pH" % i,
                    "Internal_%s_text" % i,
                ]
            )
    num_nom_vars = len(nom_vars)
    ephys_use_pks = range(1, 28)

    ephys_list = m.EphysProp.objects.filter(pk__in=ephys_use_pks)
    ephys_headers = []
    for e in ephys_list:
        ephys_name_str = re.sub(r"[\s-]", "", e.name.title())
        ephys_headers.append(ephys_name_str)

    csvout = csv.writer(
        open(settings.OUTPUT_FILES_DIRECTORY + "article_ephys_metadata_curated.csv", "w+b"), delimiter="\t"
    )

    other_headers = [
        "NeuronType",
        "Title",
        "Journal",
        "PubYear",
        "PubmedLink",
        "DataTableLinks",
        "ArticleDataLink",
        "LastAuthor",
    ]
    all_headers = other_headers
    all_headers.extend(ephys_headers)
    all_headers.extend(nom_vars + cont_var_headers)

    pubmed_base_link_str = "http://www.ncbi.nlm.nih.gov/pubmed/%d/"
    table_base_link_str = "http://neuroelectro.org/data_table/%d/"
    article_base_link_str = "http://neuroelectro.org/article/%d/"

    csvout.writerow(all_headers)
    for a in articles:
        print "processing metadata for article: %s" % a.pk
        amdms = m.ArticleMetaDataMap.objects.filter(article=a)
        curr_metadata_list = [""] * (len(nom_vars) + len(cont_vars))
        for i, v in enumerate(nom_vars):
            valid_vars = amdms.filter(metadata__name=v)
            temp_metadata_list = [vv.metadata.value for vv in valid_vars]
            if "in vitro" in temp_metadata_list and "cell culture" in temp_metadata_list:
                curr_metadata_list[i] = "cell culture"
            elif v == "Strain" and amdms.filter(metadata__value="Mice").count() > 0:
                temp_metadata_list = "C57BL"
                curr_metadata_list[i] = "C57BL"
            elif v == "Strain" and amdms.filter(metadata__value="Guinea Pigs").count() > 0:
                temp_metadata_list = "Guinea Pigs"
                curr_metadata_list[i] = "Guinea Pigs"
            elif len(temp_metadata_list) == 0 and v == "Strain":
                if amdms.filter(metadata__value="Rats").count() > 0:
                    if np.random.randn(1)[0] > 0:
                        curr_metadata_list[i] = "Sprague-Dawley"
                    else:
                        curr_metadata_list[i] = "Wistar"
            elif len(temp_metadata_list) > 1:
                temp_metadata_list = temp_metadata_list[0]
                curr_metadata_list[i] = temp_metadata_list
            else:
                curr_metadata_list[i] = u"; ".join(temp_metadata_list)
        for i, v in enumerate(cont_vars):
            valid_vars = amdms.filter(metadata__name=v)
            if valid_vars.count() > 0:
                cont_value_ob = valid_vars[0].metadata.cont_value.mean
                curr_metadata_list[i + num_nom_vars] = cont_value_ob
            else:
                # no stored value; use a sensible default or the solution reference text where possible
                if v == "RecTemp" and amdms.filter(metadata__value="in vivo").count() > 0:
                    curr_metadata_list[i + num_nom_vars] = 37.0
                elif "text" in v and ("external" in v or "internal" in v):
                    for j in range(i - 6, i - 1, 1):
                        conc_amdm = amdms.filter(metadata__name=cont_vars[j])
                        if len(conc_amdm) > 0:
                            curr_metadata_list[i + num_nom_vars] = conc_amdm[0].metadata.ref_text.text.encode(
                                "utf8", "replace"
                            )
                            break
                        else:
                            curr_metadata_list[i + num_nom_vars] = "NaN"
                else:
                    curr_metadata_list[i + num_nom_vars] = "NaN"

        # TODO: uncomment these 2 lines
        neurons = m.Neuron.objects.filter(
            Q(neuronconceptmap__times_validated__gte=1)
            & (
                Q(neuronconceptmap__source__data_table__article=a)
                | Q(neuronconceptmap__source__user_submission__article=a)
            )
        ).distinct()
        neurons = m.Neuron.objects.filter(
            Q(neuronconceptmap__source__data_table__article=a) | Q(neuronconceptmap__source__user_submission__article=a)
        ).distinct()

        pubmed_link_str = pubmed_base_link_str % a.pmid
        article_link_str = article_base_link_str % a.pk
        dts = m.DataTable.objects.filter(article=a, datasource__neuronconceptmap__times_validated__gte=1).distinct()
        if dts.count() > 0:
            dt_link_list = [table_base_link_str % dt.pk for dt in dts]
            dt_link_str = u"; ".join(dt_link_list)
        else:
            dt_link_str = ""

        # grandfather = define_ephys_grandfather(a)
        # grandfather = None
        # if grandfather is not None:
        #     grandfather_name = grandfather.lastname
        #     grandfather_name = grandfather_name.encode("iso-8859-15", "replace")
        # else:
        #     grandfather_name = ''
        last_author = get_article_last_author(a)
        if last_author is not None:
            last_author_name = "%s %s" % (last_author.last, last_author.initials)
            last_author_name = last_author_name.encode("utf8", "replace")
            # if grandfather_name is '':
            #     neuro_tree_node = get_neurotree_author(last_author)
            #     if neuro_tree_node is None:
            #         grandfather_name = 'Node not found'
        else:
            last_author_name = ""

        for n in neurons:
            curr_ephys_prop_list = []

            curr_ephys_prop_list.append(n.name)
            curr_ephys_prop_list.append((a.title).encode("utf8", "replace"))
            curr_ephys_prop_list.append(a.journal)
            curr_ephys_prop_list.append(a.pub_year)
            curr_ephys_prop_list.append(pubmed_link_str)
            curr_ephys_prop_list.append(dt_link_str)
            curr_ephys_prop_list.append(article_link_str)
            curr_ephys_prop_list.append(last_author_name)

            for e in ephys_list:
                curr_ephys_prop_list.append(computeArticleNedmSummary(a.pmid, n, e))

            curr_ephys_prop_list.extend(curr_metadata_list)
            # curr_ephys_prop_list.append(grandfather_name)

            csvout.writerow(curr_ephys_prop_list)
    return articles
def export_db_to_data_frame():
    """Returns a nicely formatted pandas data frame of the ephys data and metadata for each stored article"""

    ncms = (
        m.NeuronConceptMap.objects.all()
    )  # .order_by('-history__latest__history_date') # gets human-validated neuron mappings
    # ncms = ncms.exclude(Q(source__data_table__irrelevant_flag = True) | Q(source__data_table__needs_expert = True)) # exclude
    ncms = ncms.exclude(Q(source__data_table__irrelevant_flag=True))  # exclude

    ncm_count = ncms.count()
    ephys_props = m.EphysProp.objects.all().order_by("-ephyspropsummary__num_neurons")
    ephys_names = []
    for e in ephys_props:
        ephys_names.append(e.short_name)
        ephys_names.append(e.short_name + "_raw")
        ephys_names.append(e.short_name + "_err")
        ephys_names.append(e.short_name + "_n")
        ephys_names.append(e.short_name + "_sd")
        ephys_names.append(e.short_name + "_note")
    # ephys_names = [e.name for e in ephys_props]
    # ncms = ncms.sort('-changed_on')
    dict_list = []
    for kk, ncm in enumerate(ncms):
        prog(kk, ncm_count)

        # TODO: need to check whether nedms under the same ncm have different experimental factor concept maps
        #     # check if any nedms have any experimental factors assoc with them
        #     efcms = ne_db.ExpFactConceptMap.objects.filter(neuronephysdatamap__in = nedms)
        #     for efcm in efcms:
        #         nedms = ne_db.NeuronEphysDataMap.objects.filter(neuron_concept_map = ncm, exp_fact_concept_map = ).distinct()

        # only check whether ncms have been expertly validated, not the nedm itself
        nedms = m.NeuronEphysDataMap.objects.filter(
            neuron_concept_map=ncm, neuron_concept_map__expert_validated=True
        ).distinct()
        if nedms.count() == 0:
            continue

        temp_dict = dict()
        temp_metadata_list = []
        for nedm in nedms:
            e = nedm.ephys_concept_map.ephys_prop

            # get error type for nedm by db lookup
            error_type = nedm.get_error_type()

            # check data integrity - value MUST be in appropriate range for property
            data_val = nedm.val_norm
            data_raw_val = nedm.val
            err_val = nedm.err_norm
            n_val = nedm.n
            note_val = nedm.ephys_concept_map.note
            output_ephys_name = e.short_name
            output_ephys_raw_name = "%s_raw" % output_ephys_name
            output_ephys_err_name = "%s_err" % output_ephys_name
            output_ephys_sem_name = "%s_sem" % output_ephys_name
            output_ephys_sd_name = "%s_sd" % output_ephys_name
            output_ephys_n_name = "%s_n" % output_ephys_name
            output_ephys_note_name = "%s_note" % output_ephys_name

            # output raw vals and notes for all props
            temp_dict[output_ephys_raw_name] = data_raw_val
            temp_dict[output_ephys_note_name] = note_val

            if check_data_val_range(data_val, e):

                temp_dict[output_ephys_name] = data_val
                temp_dict[output_ephys_err_name] = err_val
                temp_dict[output_ephys_n_name] = n_val

                # do converting to standard dev from standard error if needed
                if error_type == "sd":
                    temp_dict[output_ephys_sd_name] = err_val
                else:
                    # need to calculate sd
                    if err_val and n_val:
                        sd_val = err_val * np.sqrt(n_val)
                        temp_dict[output_ephys_sd_name] = sd_val

            # temp_metadata_list.append(nedm.get_metadata())

        temp_dict["NeuronName"] = ncm.neuron.name
        temp_dict["NeuronLongName"] = ncm.neuron_long_name
        if ncm.neuron_long_name:
            temp_dict["NeuronPrefName"] = ncm.neuron_long_name
        else:
            temp_dict["NeuronPrefName"] = ncm.neuron.name
        temp_dict["NeuroNERAnnots"] = ncm.get_neuroner()
        article = ncm.get_article()

        brain_reg_dict = get_neuron_region(ncm.neuron)
        if brain_reg_dict:
            temp_dict["BrainRegion"] = brain_reg_dict["region_name"]

        # article_metadata = normalize_metadata(article)

        metadata_list = nedm.get_metadata()
        out_dict = dict()
        for metadata in metadata_list:
            # print metadata.name
            if not metadata.cont_value:
                if metadata.name in out_dict:
                    out_dict[metadata.name] = "%s, %s" % (out_dict[metadata.name], metadata.value)
                else:
                    out_dict[metadata.name] = metadata.value
                if metadata.name == "Strain":
                    out_dict["StrainNote"] = metadata.note
                if metadata.name == "Species":
                    out_dict["SpeciesNote"] = metadata.note
            elif metadata.cont_value and "Solution" in metadata.name:
                article = nedm.get_article()
                if metadata.ref_text:
                    ref_text = metadata.ref_text
                else:
                    amdm = m.ArticleMetaDataMap.objects.filter(article=article, metadata__name=metadata.name)[0]
                    ref_text = amdm.ref_text
                out_dict[metadata.name] = ref_text.text.encode("utf8", "replace")
                out_dict[metadata.name + "_conf"] = metadata.cont_value.mean
            elif metadata.cont_value and "AnimalAge" in metadata.name:
                # return geometric mean of age ranges, not arithmetic mean
                if metadata.cont_value.min_range and metadata.cont_value.max_range:
                    min_range = metadata.cont_value.min_range
                    max_range = metadata.cont_value.max_range
                    if min_range <= 0:
                        min_range = 1
                    geom_mean = np.sqrt(min_range * max_range)
                    out_dict[metadata.name] = geom_mean
                else:
                    out_dict[metadata.name] = metadata.cont_value.mean
            else:
                out_dict[metadata.name] = metadata.cont_value.mean

        # has article metadata been curated by a human?
        afts = article.get_full_text_stat()
        if afts and afts.metadata_human_assigned:
            metadata_curated = True
            metadata_curation_note = afts.metadata_curation_note
        else:
            metadata_curated = False
            metadata_curation_note = None

        if ncm.source.data_table:
            data_table_note = ncm.source.data_table.note
        else:
            data_table_note = None

        temp_dict2 = temp_dict.copy()
        temp_dict2.update(out_dict)
        temp_dict = temp_dict2
        temp_dict["Title"] = article.title
        temp_dict["Pmid"] = article.pmid
        temp_dict["PubYear"] = article.pub_year
        temp_dict["LastAuthor"] = unicode(get_article_last_author(article))
        temp_dict["FirstAuthor"] = unicode(get_article_author(article, 0))
        temp_dict["TableID"] = ncm.source.data_table_id
        temp_dict["TableNote"] = data_table_note
        temp_dict["ArticleID"] = article.pk
        temp_dict["MetadataCurated"] = metadata_curated
        temp_dict["MetadataNote"] = metadata_curation_note
        # print temp_dict
        dict_list.append(temp_dict)

    base_names = [
        "Title",
        "Pmid",
        "PubYear",
        "FirstAuthor",
        "LastAuthor",
        "ArticleID",
        "TableID",
        "NeuronName",
        "NeuronLongName",
        "NeuronPrefName",
        "NeuroNERAnnots",
        "BrainRegion",
    ]
    nom_vars = [
        "MetadataCurated",
        "Species",
        "SpeciesNote",
        "Strain",
        "StrainNote",
        "ElectrodeType",
        "PrepType",
        "JxnPotential",
    ]
    cont_vars = ["JxnOffset", "RecTemp", "AnimalAge", "AnimalWeight", "FlagSoln"]
    annot_notes = ["MetadataNote", "TableNote"]

    grouping_fields = base_names + nom_vars + cont_vars

    for i in range(0, 1):
        cont_vars.extend(
            [
                "ExternalSolution",
                "ExternalSolution_conf",
                "external_%s_Mg" % i,
                "external_%s_Ca" % i,
                "external_%s_Na" % i,
                "external_%s_Cl" % i,
                "external_%s_K" % i,
                "external_%s_pH" % i,
                "external_%s_Cs" % i,
                "external_%s_glucose" % i,
                "external_%s_HEPES" % i,
                "external_%s_EDTA" % i,
                "external_%s_EGTA" % i,
                "external_%s_BAPTA" % i,
                "external_%s_ATP" % i,
                "external_%s_GTP" % i,
                "external_%s_CNQX" % i,
                "external_%s_DNQX" % i,
                "external_%s_NBQX" % i,
                "external_%s_MK801" % i,
                "external_%s_DAPV" % i,
                "external_%s_CPP" % i,
                "external_%s_kynur" % i,
                "external_%s_BIC" % i,
                "external_%s_picro" % i,
                "external_%s_gabazine" % i,
                "external_%s_CGP" % i,
                "external_%s_strychnine" % i,
                "InternalSolution",
                "InternalSolution_conf",
                "internal_%s_Mg" % i,
                "internal_%s_Ca" % i,
                "internal_%s_Na" % i,
                "internal_%s_Cl" % i,
                "internal_%s_K" % i,
                "internal_%s_pH" % i,
                "internal_%s_Cs" % i,
                "internal_%s_glucose" % i,
                "internal_%s_HEPES" % i,
                "internal_%s_EDTA" % i,
                "internal_%s_EGTA" % i,
                "internal_%s_BAPTA" % i,
                "internal_%s_ATP" % i,
                "internal_%s_GTP" % i,
            ]
        )

    col_names = base_names + nom_vars + cont_vars + annot_notes + ephys_names

    # not sure why but writing and reading data frame seems to fix a problem with ephys property pooling fxn
    df = pd.DataFrame(dict_list, columns=col_names)
    df.to_csv("temp.csv", sep="\t", encoding="utf-8")
    df = pd.read_csv("temp.csv", sep="\t", index_col=0, header=0)

    # perform collapsing of rows about same neuron types but potentially across different tables
    # this should be optional if the goal is ephys recuration, not ephys reanalysis
    grouping_fields.remove("TableID")
    grouping_fields.remove("NeuroNERAnnots")
    cleaned_df = pool_ephys_props_across_tables(df, grouping_fields)

    # add in extra ephys data from columns based on known relationships, e.g., AP amp from AP peak and AP thr
    cleaned_df = add_ephys_props_by_conversion(cleaned_df)

    # returning 2 data frames, 1 with properties pooled and calculated based on algebra, 1 not
    return cleaned_df, df
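
The newer exporter returns two frames, one pooled across tables and one not; a hypothetical call site (file names are illustrative):

# Sketch: keep both the pooled and the unpooled frame returned above.
pooled_df, raw_df = export_db_to_data_frame()
pooled_df.to_csv('neuroelectro_pooled.tsv', sep='\t', encoding='utf-8')
raw_df.to_csv('neuroelectro_unpooled.tsv', sep='\t', encoding='utf-8')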
def getAllArticleNedmMetadataSummary():
    """Legacy exporter: writes per-neuron ephys summaries plus curated metadata
    for validated, human-curated articles to article_ephys_metadata_summary.csv."""
    articles = m.Article.objects.filter(Q(datatable__datasource__neuronconceptmap__times_validated__gte = 1) | 
        Q(usersubmission__datasource__neuronconceptmap__times_validated__gte = 1)).distinct()
    articles = articles.filter(articlefulltext__articlefulltextstat__metadata_human_assigned = True ).distinct()
    nom_vars = ['Species', 'Strain', 'ElectrodeType', 'PrepType', 'JxnPotential']
    cont_vars  = ['JxnOffset', 'RecTemp', 'AnimalAge', 'AnimalWeight']
    cont_var_headers = ['JxnOffset', 'Temp', 'Age', 'Weight']
    num_nom_vars = len(nom_vars)
    #ephys_use_pks = [2, 3, 4, 5, 6, 7]
    #ephys_headers = ['ir', 'rmp', 'tau', 'amp', 'hw', 'thresh']
    ephys_use_pks = range(1,28)

    ephys_list = m.EphysProp.objects.filter(pk__in = ephys_use_pks)
    ephys_headers = []
    for e in ephys_list:
        ephys_name_str = e.name
        ephys_name_str = ephys_name_str.title()
        ephys_name_str = ephys_name_str.replace(' ', '')
        ephys_name_str = ephys_name_str.replace('-', '')
        ephys_headers.append(ephys_name_str)

    #ephys_headers = [e.name for e in ephys_list]
#    metadata_table = []
#    metadata_table_nom = np.zeros([len(articles), len(nom_vars)])
#    metadata_table_nom = np.zeros([len(articles), len(cont_vars)])
    csvout = csv.writer(open("article_ephys_metadata_summary.csv", "wb"))
    
    
    #metadata_headers = ["Species", "Strain", "ElectrodeType", "PrepType", "Temp", "Age", "Weight"]
    metadata_headers = nom_vars + cont_var_headers
    other_headers = ['NeuronType', 'Title', 'PubYear', 'PubmedLink', 'DataTableLinks', 'ArticleDataLink', 'LastAuthor']
    all_headers = ephys_headers
    all_headers.extend(metadata_headers)
    all_headers.extend(other_headers)
    pubmed_base_link_str = 'http://www.ncbi.nlm.nih.gov/pubmed/%d/'
    table_base_link_str = 'http://neuroelectro.org/data_table/%d/'
    article_base_link_str = 'http://neuroelectro.org/article/%d/'

    csvout.writerow(all_headers)
    for j,a in enumerate(articles):
        amdms = m.ArticleMetaDataMap.objects.filter(article = a)
        curr_metadata_list = ['']*(len(nom_vars) + len(cont_vars))
        for i,v in enumerate(nom_vars):
            valid_vars = amdms.filter(metadata__name = v)
            temp_metadata_list = [vv.metadata.value for vv in valid_vars]
            if 'in vitro' in temp_metadata_list and 'cell culture' in temp_metadata_list:
                curr_metadata_list[i] = 'cell culture'
            elif v == 'Strain' and amdms.filter(metadata__value = 'Mice').count() > 0:
                temp_metadata_list = 'C57BL'
                curr_metadata_list[i] = 'C57BL'
            elif v == 'Strain' and amdms.filter(metadata__value = 'Guinea Pigs').count() > 0:
                temp_metadata_list = 'Guinea Pigs'
                curr_metadata_list[i] = 'Guinea Pigs'
            elif len(temp_metadata_list) == 0 and v == 'Strain':
                if amdms.filter(metadata__value = 'Rats').count() > 0:
                    if np.random.randn(1)[0] > 0:
                        curr_metadata_list[i] = 'Sprague-Dawley'
                    else:
                        curr_metadata_list[i] = 'Wistar'
            elif len(temp_metadata_list) > 1: 
                temp_metadata_list = temp_metadata_list[0]
                curr_metadata_list[i] = temp_metadata_list
            else:
                curr_metadata_list[i] = u'; '.join(temp_metadata_list)
        for i,v in enumerate(cont_vars):
            valid_vars = amdms.filter(metadata__name = v)
            if valid_vars.count() > 0:
                cont_value_ob = valid_vars[0].metadata.cont_value.mean
    #                curr_str = cont_value_ob
                curr_metadata_list[i+num_nom_vars] = cont_value_ob
            else:
                # no stored value; if the prep was in vivo, assume body temperature for RecTemp
                if v == 'RecTemp' and amdms.filter(metadata__value = 'in vivo').count() > 0:
                    curr_metadata_list[i+num_nom_vars] = 37.0
                else:
                    curr_metadata_list[i+num_nom_vars] = 'NaN'
                    
        neurons = m.Neuron.objects.filter(Q(neuronconceptmap__times_validated__gte = 1) & 
            ( Q(neuronconceptmap__source__data_table__article = a) | Q(neuronconceptmap__source__user_submission__article = a))).distinct()
            
        
        pmid = a.pmid    
        pubmed_link_str = pubmed_base_link_str % a.pmid
        article_link_str = article_base_link_str % a.pk
        dts = m.DataTable.objects.filter(article = a, datasource__neuronconceptmap__times_validated__gte = 1).distinct()
        if dts.count() > 0:
            dt_link_list = [table_base_link_str % dt.pk for dt in dts] 
            dt_link_str = u'; '.join(dt_link_list)
        else:
            dt_link_str = ''  
        
        #grandfather = define_ephys_grandfather(a)   
        # grandfather = None
        # if grandfather is not None:
        #     grandfather_name = grandfather.lastname
        #     grandfather_name = grandfather_name.encode("iso-8859-15", "replace")
        # else:
        #     grandfather_name = ''
        last_author = get_article_last_author(a)
        if last_author is not None:
            last_author_name = '%s %s' % (last_author.last, last_author.initials)
            last_author_name = last_author_name.encode("iso-8859-15", "replace")
            # if grandfather_name is '':
            #     neuro_tree_node = get_neurotree_author(last_author)
            #     if neuro_tree_node is None:
            #         grandfather_name = 'Node not found'
        else:
            last_author_name = ''
            
        for n in neurons:
            curr_ephys_prop_list = []
            for e in ephys_list:
                curr_ephys_prop_list.append(computeArticleNedmSummary(pmid, n, e))
        
#            print curr_ephys_prop_list
            curr_ephys_prop_list.extend(curr_metadata_list)
            curr_ephys_prop_list.append(n.name)
            curr_ephys_prop_list.append((a.title).encode("iso-8859-15", "replace"))
            curr_ephys_prop_list.append(a.pub_year)
            curr_ephys_prop_list.append(pubmed_link_str)
            curr_ephys_prop_list.append(dt_link_str)
            curr_ephys_prop_list.append(article_link_str)
            curr_ephys_prop_list.append(last_author_name)
            #curr_ephys_prop_list.append(grandfather_name)
            csvout.writerow(curr_ephys_prop_list)
    return articles
Example #9
def get_neurotree_authors():
    """
    Returns a list of NeuroTree author nodes corresponding to last authors 
    of NeuroElectro articles.
    Also returns statistics based on how many NeuroElectro authors had 
    corresponding entries in NeuroTree
    """
    
    q1 = Q(datatable__datasource__neuronconceptmap__times_validated__gte=1)
    q2 = Q(usersubmission__datasource__neuronconceptmap__times_validated__gte=1)
    articles = m.Article.objects.filter(q1 | q2).distinct()
        
    found_count = 0
    cant_resolve_count = 0
    cant_find_count = 0
    last_author_node_list = []
    for i,article in enumerate(articles):
        print i
#        print article
#        print article.author_list_str
        author_ob = get_article_last_author(article)
        if not author_ob:
            print 'Article %s does not have an author list string' % \
                    article.title
            cant_find_count += 1
            last_author_node_list.append(None)
            continue
        last_name = author_ob.last
        first_name = author_ob.first.split()[0]
        # get neurotree author object corresponding to pubmed author object
        a_node_query = t.Node.objects.filter(lastname = last_name)
        if a_node_query.count() > 0: # checks that at least one NeuroTree node shares the last name
            a_node_query = t.Node.objects.filter(lastname = last_name,
                                    firstname__icontains = first_name[0])
            if a_node_query.count() > 1:
                a_node_query = t.Node.objects.filter(lastname = last_name, 
                                            firstname__icontains = first_name)
                if a_node_query.count() > 1:
                    print 'Author: %s, %s has too many identical nodes in NeuroTree' % \
                            (last_name, first_name)
                    cant_resolve_count += 1
                    last_author_node_list.append(None)
        if a_node_query.count() == 0:
            print 'Author: %s, %s not in NeuroTree' % \
                    (last_name, first_name)
            cant_find_count += 1
            last_author_node_list.append(None)  # keep the list aligned with the other unmatched branches
        if a_node_query.count() == 1:
            #################################################
            # author_node is author variable in neuro tree  #
            #################################################
            author_node = a_node_query[0]
            last_author_node_list.append(author_node)

            print 'Author: %s, found in NeuroTree' % author_node
            found_count += 1
            
    authors = []
    none_count = 0
    duplicate_count = 0
    for author in last_author_node_list:
        if author not in authors:
            if author is not None:
                authors.append(author)
            else:
                none_count += 1
                found_count -= 1
        else:
            duplicate_count += 1
            found_count -= 1

    return (authors, found_count, cant_resolve_count, 
            cant_find_count, duplicate_count, none_count)
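
A short sketch of how the returned statistics might be reported (hypothetical call site, mirroring the Python 2 print style used above):

# Sketch: report the NeuroTree matching statistics computed above.
(authors, found_count, cant_resolve_count,
 cant_find_count, duplicate_count, none_count) = get_neurotree_authors()
print 'unique NeuroTree matches: %d' % found_count
print 'ambiguous in NeuroTree:   %d' % cant_resolve_count
print 'not found in NeuroTree:   %d' % cant_find_count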