Ejemplo n.º 1
0
def update_terms(grant):
    """Rebuild ``grant.terms`` from the grant's title and abstract.

    The title (lower-cased) and then the abstract (unchanged) are each fed
    to ``noun_phrases``; a field that is empty or ``None`` contributes
    nothing. The combined list is serialised with
    ``sdb_db.stringify_terms`` and assigned to ``grant.terms``.
    """
    collected = []

    title = grant.title
    if title:
        # Only the title is lower-cased before phrase extraction.
        collected.extend(noun_phrases(title.lower()))

    abstract = grant.abstract
    if abstract:
        collected.extend(noun_phrases(abstract))

    grant.terms = sdb_db.stringify_terms(collected)
Ejemplo n.º 2
0
def update_terms(grant):
    """Recompute the serialised term string stored on *grant*.

    Terms come from ``noun_phrases`` applied to the lower-cased title
    followed by the abstract as-is. Falsy fields (``None`` or empty) are
    skipped. The result of ``sdb_db.stringify_terms`` on the combined list
    is written to ``grant.terms``.
    """
    # Title terms are materialised before the abstract is looked at, so the
    # order of work matches a sequential "title, then abstract" pass.
    title_terms = list(noun_phrases(grant.title.lower())) if grant.title else []
    abstract_terms = list(noun_phrases(grant.abstract)) if grant.abstract else []
    grant.terms = sdb_db.stringify_terms(title_terms + abstract_terms)
Ejemplo n.º 3
0
            # check to see if we've reached the end of a document tag
            if elem.tag in CATEGORIES:
                # store attribute info, do preprocessing if necessary
                title = data.get('title')
                year = data.get('year')
                author_names = data.get('author_names', [])
                journal_name = data.get('journal_name')
                conference_name = data.get('conference_name')

                # clear out attribute info, and write
                data = {}
                doc = db.Document(title=title, year=year)
                # if this item has a title, memoize the terms and check if it's
                # clean (aka usable)
                if title != None:
                    doc.terms = ','.join([' '.join(phrase) for phrase in noun_phrases(preprocess(title))])
                    doc.clean = ok_title(title)
                else:
                # doc doesn't have a title, so mark it as unusable
                    doc.clean = False
                # take care of authors and journal
                for author_name in author_names:
                    doc.authors.append(memoized_row(db.Author, author_memo, author_name))
                if journal_name != None:
                    doc.journal = memoized_row(db.Journal, journal_memo, journal_name)
                if conference_name != None:
                    doc.conference = memoized_row(db.Conference, conference_memo, conference_name)

                session.add(doc)
                count += 1
                # commit changes periodically
Ejemplo n.º 4
0
                # store attribute info, do preprocessing if necessary
                title = data.get('title')
                year = data.get('year')
                author_names = data.get('author_names', [])
                journal_name = data.get('journal_name')
                conference_name = data.get('conference_name')

                # clear out attribute info, and write
                data = {}
                doc = db.Document(title=title, year=year)
                # if this item has a title, memoize the terms and check if it's
                # clean (aka usable)
                if title != None:
                    doc.terms = ','.join([
                        ' '.join(phrase)
                        for phrase in noun_phrases(preprocess(title))
                    ])
                    doc.clean = ok_title(title)
                else:
                    # doc doesn't have a title, so mark it as unusable
                    doc.clean = False
                # take care of authors and journal
                for author_name in author_names:
                    doc.authors.append(
                        memoized_row(db.Author, author_memo, author_name))
                if journal_name != None:
                    doc.journal = memoized_row(db.Journal, journal_memo,
                                               journal_name)
                if conference_name != None:
                    doc.conference = memoized_row(db.Conference,
                                                  conference_memo,
Ejemplo n.º 5
0
import mocs_database as db
from chunking import noun_phrases
from build_dblp_database import ok_title
from database import ManagedSession


def preprocess(title):
    """Return *title* converted to lower case."""
    lowered = title.lower()
    return lowered


if __name__ == "__main__":
    # Recompute the stored `terms` string and `clean` flag for every
    # Document row, reporting progress every 1000 records.
    with ManagedSession() as session:
        query = session.query(db.Document)
        # Total row count; only used for the progress percentage below.
        N = query.count()
        # Initialised up front so the final report works for an empty table.
        count = 0
        records = db.sliced_query(query, session_to_write=session)
        for count, record in enumerate(records, 1):
            if record.title:
                # Terms are stored as comma-separated phrases, each phrase
                # being its tokens joined by single spaces.
                phrases = noun_phrases(preprocess(record.title))
                record.terms = ",".join([" ".join(phrase) for phrase in phrases])
                record.clean = ok_title(record.title)
            else:
                # No title: nothing to extract, so mark the record unusable.
                record.clean = False
            if count % 1000 == 0:
                # Parenthesised single-argument print is valid on both
                # Python 2 and Python 3 and prints the same text.
                print("updated %s records (%.f%%)" % (count, float(count) * 100 / N))
    print("finished, updated %s records" % count)
Ejemplo n.º 6
0
import mocs_database as db
from chunking import noun_phrases
from build_dblp_database import ok_title
from database import ManagedSession


def preprocess(title):
    """Normalise *title* for term extraction by lower-casing it."""
    normalized = title.lower()
    return normalized


if __name__ == "__main__":
    # Walk every Document row, refreshing its `terms` string and `clean`
    # flag, and print a progress line every 1000 records.
    with ManagedSession() as session:
        query = session.query(db.Document)
        # Total number of rows, used as the denominator of the percentage.
        N = query.count()
        # Pre-set so the closing message is correct when there are no rows.
        count = 0
        for count, record in enumerate(
                db.sliced_query(query, session_to_write=session), 1):
            if not record.title:
                # Untitled records cannot be processed; flag them unusable.
                record.clean = False
            else:
                # Comma-separated phrases; tokens within a phrase are
                # separated by single spaces.
                record.terms = ','.join([
                    ' '.join(phrase)
                    for phrase in noun_phrases(preprocess(record.title))
                ])
                record.clean = ok_title(record.title)
            if count % 1000 == 0:
                # Single-argument print(...) parses on Python 2 and 3 alike.
                print('updated %s records (%.f%%)' % (count,
                                                      float(count) * 100 / N))
    print('finished, updated %s records' % count)