def add_table_ob_to_article(table_html, article_ob, text_mine = True, uploading_user = None):
    """Store one HTML table as a DataTable attached to an article.

    The markup is parsed with BeautifulSoup (lxml parser), serialised back to
    a string, passed through add_id_tags_to_table, and stored together with
    the table's plain text through get_or_create. A DataSource row is
    get-or-created for the table as well.

    Args:
        table_html: markup of a single table.
        article_ob: article object the table is attached to.
        text_mine: when true, run assocDataTableEphysVal on the stored table.
        uploading_user: user who supplied the table, or None. The record's
            user_uploaded flag is the truthiness of this value.

    Returns:
        The table object returned by remove_spurious_table_headers.
    """
    table_soup = BeautifulSoup(table_html, 'lxml')
    table_html_cleaned = add_id_tags_to_table(str(table_soup))
    # Only the first 9999 characters of the table text are stored.
    table_text = table_soup.get_text()[:9999]

    data_table_ob = m.DataTable.objects.get_or_create(
        article=article_ob,
        table_html=table_html_cleaned,
        table_text=table_text,
        uploading_user=uploading_user,
        user_uploaded=bool(uploading_user),
    )[0]
    # takes care of weird header thing for elsevier xml tables
    data_table_ob = remove_spurious_table_headers(data_table_ob)
    # Called for its side effect only: make sure a DataSource row exists.
    m.DataSource.objects.get_or_create(data_table=data_table_ob)

    # apply initial text mining of ephys concepts to table
    if text_mine:
        assocDataTableEphysVal(data_table_ob)

    # creates data table stat object, relevance is to count and store num of
    # unique ecms that were TMed
    # NOTE(review): assumed to run regardless of text_mine -- confirm nesting.
    update_data_table_stat(data_table_ob)
    return data_table_ob
def ephys_table_identify():
    """Text-mine ephys concepts in tables of articles not yet flagged as processed.

    Selects articles that have at least one data table and a full text,
    excluding those whose full-text stat already has
    data_table_ephys_processed set, then runs assocDataTableEphysVal on every
    table of those articles. After a table is handled, the owning article's
    ArticleFullTextStat is flagged as processed and saved, so the exclusion
    above takes effect on later runs.
    """
    artObs = m.Article.objects.filter(datatable__isnull=False,
                                      articlefulltext__isnull=False).distinct()
    artObs = artObs.exclude(articlefulltext__articlefulltextstat__data_table_ephys_processed=True)
    dataTableObs = m.DataTable.objects.filter(article__in=artObs).distinct()
    num_tables = dataTableObs.count()
    # Single-argument parenthesised form prints the same text under the
    # Python 2 print statement.
    print('analyzing %s tables' % num_tables)
    for i, dt in enumerate(dataTableObs):
        prog(i, num_tables)
        assocDataTableEphysVal(dt)
        aft_ob = dt.article.get_full_text()
        if aft_ob is None:
            # No full text object to attach the processed flag to.
            continue
        aftStatOb = m.ArticleFullTextStat.objects.get_or_create(article_full_text=aft_ob)[0]
        aftStatOb.data_table_ephys_processed = True
        # Persist the flag; assigning the attribute alone does not write it
        # to the database.
        aftStatOb.save()
def ephys_table_identify_block(pk_inds):
    """Text-mine ephys concepts in the data tables whose primary keys are given.

    Runs assocDataTableEphysVal on each selected table, then flags the owning
    article's ArticleFullTextStat as data_table_ephys_processed and saves it.

    Args:
        pk_inds: iterable of DataTable primary keys to process.
    """
    dataTableObs = m.DataTable.objects.filter(pk__in=pk_inds).distinct()
    num_tables = dataTableObs.count()
    # Single-argument parenthesised form prints the same text under the
    # Python 2 print statement.
    print('analyzing %s tables in block' % num_tables)
    for i, dt in enumerate(dataTableObs):
        #prog(i, num_tables)
        assocDataTableEphysVal(dt)
        art = dt.article
        print(art)
        aft_ob = art.get_full_text()
        if aft_ob is not None:
            aftStatOb = m.ArticleFullTextStat.objects.get_or_create(article_full_text=aft_ob)[0]
            aftStatOb.data_table_ephys_processed = True
            # Persist the flag; assigning the attribute alone does not write
            # it to the database.
            aftStatOb.save()
        # NOTE(review): index print assumed to be per-iteration progress
        # output at loop level -- confirm nesting.
        print(i)
html_tables = extract_tables_from_xml(aft.get_content(), file_name) else: html_tables = extract_tables_from_html(aft.get_content(), file_name) for table in html_tables: tableSoup = BeautifulSoup(table) table_html = str(tableSoup) table_html = add_id_tags_to_table(table_html) table_text = tableSoup.get_text() table_text = table_text[0:min(9999,len(table_text))] data_table_ob = m.DataTable.objects.get_or_create(article = a, table_html = table_html, table_text = table_text)[0] data_table_ob = remove_spurious_table_headers(data_table_ob) # takes care of weird header thing for elsevier xml tables ds = m.DataSource.objects.get_or_create(data_table=data_table_ob)[0] # apply initial text mining of ephys concepts to table assocDataTableEphysVal(data_table_ob) # text mine article level metadata apply_article_metadata(a) except Exception, e: with open('failed_files.txt', 'a') as f: f.write('%s\\%s' % (file_name, e)) print e print file_name finally: f.close() # if html_tables is not None: # do a check to see if tables already exist, if do, just return # if a.datatable_set.all().count() > 0: # return a