def add_table_ob_to_article(table_html, article_ob, text_mine = True, uploading_user = None):
    """Get or create the DataTable for `table_html` on `article_ob` and return it.

    The HTML is parsed with BeautifulSoup (lxml parser), re-serialised, and
    passed through `add_id_tags_to_table`. The table's plain text is
    truncated to its first 9999 characters. A DataSource row pointing at the
    table is also fetched or created.

    Args:
        table_html: HTML markup of a single table.
        article_ob: article object the table is attached to.
        text_mine: when truthy, run `assocDataTableEphysVal` on the table and
            then `update_data_table_stat`.
        uploading_user: stored on the table as-is; `user_uploaded` is set to
            the truthiness of this value.

    Returns:
        The data table object as returned by `remove_spurious_table_headers`.
    """
    user_uploaded = bool(uploading_user)

    soup = BeautifulSoup(table_html, 'lxml')
    cleaned_html = add_id_tags_to_table(str(soup))
    # Keep at most the first 9999 characters of the table's plain text.
    text_snippet = soup.get_text()[:9999]

    table_ob, _created = m.DataTable.objects.get_or_create(
        article=article_ob,
        table_html=cleaned_html,
        table_text=text_snippet,
        uploading_user=uploading_user,
        user_uploaded=user_uploaded,
    )
    # takes care of weird header thing for elsevier xml tables
    table_ob = remove_spurious_table_headers(table_ob)

    # Make sure a DataSource row exists for this table; the row itself is
    # not needed here.
    m.DataSource.objects.get_or_create(data_table=table_ob)

    if text_mine:
        # apply initial text mining of ephys concepts to table
        assocDataTableEphysVal(table_ob)
        # creates data table stat object, relevance is to count and store
        # num of unique ecms that were TMed
        update_data_table_stat(table_ob)

    return table_ob
def ephys_table_identify():
    """Run `assocDataTableEphysVal` over tables of not-yet-processed articles.

    Articles are selected when they have at least one data table and a full
    text, excluding those whose full-text stat already has
    `data_table_ephys_processed` set to True. After each table is handled,
    the article's full-text stat (created if missing) is flagged as
    processed, provided `get_full_text()` returned something.
    """
    pending_articles = (
        m.Article.objects
        .filter(datatable__isnull=False, articlefulltext__isnull=False)
        .distinct()
        .exclude(articlefulltext__articlefulltextstat__data_table_ephys_processed=True)
    )
    tables = m.DataTable.objects.filter(article__in=pending_articles).distinct()
    total = tables.count()
    print('analyzing %s tables' % total)

    for idx, table in enumerate(tables):
        prog(idx, total)
        assocDataTableEphysVal(table)

        full_text = table.article.get_full_text()
        if full_text is None:
            # No full text object: nothing to flag for this article.
            continue

        stat, _created = m.ArticleFullTextStat.objects.get_or_create(
            article_full_text=full_text
        )
        stat.data_table_ephys_processed = True
        stat.save()
def ephys_table_identify_block(pk_inds):
    """Run `assocDataTableEphysVal` over the data tables whose pk is in `pk_inds`.

    For each table the owning article is printed; when the article has a
    full text object, its full-text stat (created if missing) is flagged
    with `data_table_ephys_processed = True` and the loop index is printed.

    Args:
        pk_inds: primary keys used in the `pk__in` filter on DataTable.
    """
    tables = m.DataTable.objects.filter(pk__in=pk_inds).distinct()
    total = tables.count()
    print('analyzing %s tables in block' % total)

    # Progress reporting through prog() is intentionally left out here.
    for idx, table in enumerate(tables):
        assocDataTableEphysVal(table)

        article = table.article
        print(article)

        full_text = article.get_full_text()
        if full_text is None:
            # No full text object: nothing to flag, and no index is printed.
            continue

        stat, _created = m.ArticleFullTextStat.objects.get_or_create(
            article_full_text=full_text
        )
        stat.data_table_ephys_processed = True
        stat.save()
        print(idx)
            html_tables = extract_tables_from_xml(aft.get_content(), file_name)
        else:
            html_tables = extract_tables_from_html(aft.get_content(), file_name)
        
        for table in html_tables:
            tableSoup = BeautifulSoup(table)
            table_html = str(tableSoup)
            table_html = add_id_tags_to_table(table_html)
            table_text = tableSoup.get_text()
            table_text = table_text[0:min(9999,len(table_text))]
            data_table_ob = m.DataTable.objects.get_or_create(article = a, table_html = table_html, table_text = table_text)[0]
            data_table_ob = remove_spurious_table_headers(data_table_ob) # takes care of weird header thing for elsevier xml tables
            ds = m.DataSource.objects.get_or_create(data_table=data_table_ob)[0]    
            
            # apply initial text mining of ephys concepts to table
            assocDataTableEphysVal(data_table_ob)
            
        # text mine article level metadata
        apply_article_metadata(a)

    except Exception, e:
        with open('failed_files.txt', 'a') as f:
            f.write('%s\\%s' % (file_name, e))
        print e
        print file_name
    finally:
        f.close()
#     if html_tables is not None:
        # do a check to see if tables already exist, if do, just return
#         if a.datatable_set.all().count() > 0:
#             return a