def update_terms(grant):
    """Rebuild ``grant.terms`` from the grant's title and abstract.

    Noun phrases are extracted from the lower-cased title and from the
    abstract, each only when the field is truthy. The combined list is
    handed to ``sdb_db.stringify_terms`` and the result is stored on
    ``grant.terms``. Nothing is returned; the grant is modified in place.
    """
    phrases = []

    title = grant.title
    if title:
        phrases.extend(noun_phrases(title.lower()))

    abstract = grant.abstract
    if abstract:
        # NOTE(review): unlike the title, the abstract is passed through
        # without lower-casing -- confirm this asymmetry is intentional.
        phrases.extend(noun_phrases(abstract))

    grant.terms = sdb_db.stringify_terms(phrases)
# Check to see if we've reached the end of a document tag.
if elem.tag in CATEGORIES:
    # Pull the attributes collected for this document out of `data`.
    # Missing keys come back as None (or an empty list for the authors).
    title = data.get('title')
    year = data.get('year')
    author_names = data.get('author_names', [])
    journal_name = data.get('journal_name')
    conference_name = data.get('conference_name')

    # Clear out the attribute info so the next document starts empty.
    data = {}

    doc = db.Document(title=title, year=year)

    # If this item has a title, memoize its terms (words of a phrase joined
    # by spaces, phrases joined by commas) and record whether the title is
    # clean (aka usable).
    if title is not None:
        doc.terms = ','.join(
            ' '.join(phrase) for phrase in noun_phrases(preprocess(title)))
        doc.clean = ok_title(title)
    else:
        # The doc doesn't have a title, so mark it as unusable.
        doc.clean = False

    # Take care of authors, journal and conference. Each name is resolved
    # through memoized_row together with the matching memo mapping
    # (presumably so a repeated name reuses one row -- confirm in the helper).
    for author_name in author_names:
        doc.authors.append(memoized_row(db.Author, author_memo, author_name))
    if journal_name is not None:
        doc.journal = memoized_row(db.Journal, journal_memo, journal_name)
    if conference_name is not None:
        doc.conference = memoized_row(
            db.Conference, conference_memo, conference_name)

    session.add(doc)
    count += 1
    # commit changes periodically
# store attribute info, do preprocessing if necessary title = data.get('title') year = data.get('year') author_names = data.get('author_names', []) journal_name = data.get('journal_name') conference_name = data.get('conference_name') # clear out attribute info, and write data = {} doc = db.Document(title=title, year=year) # if this item has a title, memoize the terms and check if it's # clean (aka usable) if title != None: doc.terms = ','.join([ ' '.join(phrase) for phrase in noun_phrases(preprocess(title)) ]) doc.clean = ok_title(title) else: # doc doesn't have a title, so mark it as unusable doc.clean = False # take care of authors and journal for author_name in author_names: doc.authors.append( memoized_row(db.Author, author_memo, author_name)) if journal_name != None: doc.journal = memoized_row(db.Journal, journal_memo, journal_name) if conference_name != None: doc.conference = memoized_row(db.Conference, conference_memo,
# Script: recompute the memoized `terms` string and the `clean` flag of every
# Document row.
import mocs_database as db
from chunking import noun_phrases
from build_dblp_database import ok_title
from database import ManagedSession


def preprocess(title):
    """Return *title* lower-cased, the form that is passed to noun_phrases."""
    return title.lower()


def _title_terms(title):
    """Serialize the noun phrases of *title* into one string.

    The words of each phrase are joined by spaces and the phrases are
    joined by commas.
    """
    return ",".join(
        " ".join(phrase) for phrase in noun_phrases(preprocess(title)))


def main():
    """Update `terms` and `clean` on every Document, printing progress.

    Records with a falsy title get ``clean = False`` and their `terms`
    are left untouched. A progress line is printed every 1000 records.
    """
    with ManagedSession() as session:
        query = session.query(db.Document)
        total = query.count()
        count = 0
        for record in db.sliced_query(query, session_to_write=session):
            count += 1
            if record.title:
                record.terms = _title_terms(record.title)
                record.clean = ok_title(record.title)
            else:
                # No title to analyse, so the record is marked unusable.
                record.clean = False
            if count % 1000 == 0:
                # A single parenthesized argument prints the same text under
                # the Python 2 print statement and the Python 3 function.
                # `total` cannot be zero here: count is at least 1000.
                print("updated %s records (%.f%%)"
                      % (count, float(count) * 100 / total))
        # NOTE(review): placed after the loop but still inside the session
        # block -- confirm against the original layout.
        print("finished, updated %s records" % count)


if __name__ == "__main__":
    main()
# Maintenance script: walk all Document rows and refresh the cached `terms`
# string and the `clean` flag derived from each title.
import mocs_database as db
from chunking import noun_phrases
from build_dblp_database import ok_title
from database import ManagedSession


def preprocess(title):
    """Normalize a document title for noun-phrase chunking by lower-casing it."""
    return title.lower()


if __name__ == "__main__":
    with ManagedSession() as session:
        query = session.query(db.Document)
        N = query.count()
        # Stays 0 when the query yields no records, so the summary line
        # below is still valid.
        count = 0
        for count, record in enumerate(
                db.sliced_query(query, session_to_write=session), 1):
            title = record.title
            if not title:
                # Nothing to chunk: flag the record as unusable and leave
                # its terms as they are.
                record.clean = False
            else:
                phrases = noun_phrases(preprocess(title))
                # Words within a phrase are space-separated; phrases are
                # comma-separated.
                record.terms = ','.join(' '.join(phrase) for phrase in phrases)
                record.clean = ok_title(title)
            if count % 1000 == 0:
                # Parenthesized single-argument print: same output on
                # Python 2, and valid syntax on Python 3.
                print('updated %s records (%.f%%)'
                      % (count, float(count) * 100 / N))
        # NOTE(review): kept inside the session block, right after the
        # loop -- confirm against the original layout.
        print('finished, updated %s records' % count)