def ephys_table_identify():
    """Run ephys value association over tables of not-yet-processed articles.

    Selects articles that have at least one data table and a full text,
    drops those whose full-text stat already has
    ``data_table_ephys_processed`` set, and calls ``assocDataTableEphysVal``
    on every data table belonging to the remaining articles. After each
    table is handled, the owning article's full-text stat record (created
    on demand) is flagged as processed and saved.

    Takes no arguments and returns nothing; progress is reported through
    ``prog`` and a count line printed to stdout.
    """
    pending_articles = m.Article.objects.filter(
        datatable__isnull=False,
        articlefulltext__isnull=False,
    ).distinct()
    # Skip articles whose tables were already handled in an earlier run.
    pending_articles = pending_articles.exclude(
        articlefulltext__articlefulltextstat__data_table_ephys_processed=True
    )

    pending_tables = m.DataTable.objects.filter(
        article__in=pending_articles
    ).distinct()
    num_tables = pending_tables.count()
    print("analyzing %s tables" % num_tables)

    for index, data_table in enumerate(pending_tables):
        prog(index, num_tables)
        assocDataTableEphysVal(data_table)

        full_text = data_table.article.get_full_text()
        if full_text is None:
            # No full text object to attach the processed flag to.
            continue

        stat, _created = m.ArticleFullTextStat.objects.get_or_create(
            article_full_text=full_text
        )
        stat.data_table_ephys_processed = True
        stat.save()
tableSoup = BeautifulSoup(table) table_html = str(tableSoup) table_html = add_id_tags_to_table(table_html) table_text = tableSoup.get_text() table_text = table_text[0 : min(9999, len(table_text))] data_table_ob = m.DataTable.objects.get_or_create(article=a, table_html=table_html, table_text=table_text)[ 0 ] data_table_ob = addIdsToTable(data_table_ob) # add table id elements if there aren't any data_table_ob = remove_spurious_table_headers( data_table_ob ) # takes care of weird header thing for elsevier xml tables ds = m.DataSource.objects.get_or_create(data_table=data_table_ob)[0] # apply initial text mining of ephys concepts to table assocDataTableEphysVal(data_table_ob) # text mine article level metadata apply_article_metadata(a) except Exception, e: with open("failed_files.txt", "a") as f: f.write("%s\\%s" % (file_name, e)) print e print file_name finally: f.close() # if html_tables is not None: # do a check to see if tables already exist, if do, just return # if a.datatable_set.all().count() > 0: # return a