def save_root_categories_to_db(root_categories):
  """Persist root category names into the 'root_categories' table.

  root_categories: iterable of sequences whose first element is the
  category name (e.g. rows like ('physics',)).
  Creates the table on first use and skips names already present.
  """
  db, cursor = init_db()
  table_name = 'root_categories'
  if not is_tbl_exists(db, cursor, table_name):
    cursor.execute('CREATE TABLE ' + table_name + \
        ' (id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, ' + \
        ' name VARCHAR(100) NOT NULL)')
  for root_category in root_categories:
    if not is_root_category_in_db(db, cursor, root_category[0]):
      try:
        cursor.execute("""INSERT INTO root_categories (name) VALUES (%s)""",
                       (root_category[0],))
        # BUG FIX: inserts were never committed, so they could be lost
        # when the connection closed.
        db.commit()
      except Exception as e:
        db.rollback()
        print(repr(e))
  # BUG FIX: the connection was leaked; every sibling function in this
  # file closes it via close_db().
  close_db(db, cursor)
def invalidate_stopwords_from_keyphrases(stopwords):
  """Mark every stopword row in 'ngrams' as invalid (is_valid=0).

  stopwords: iterable of ngram name strings.
  Raises Exception if the 'ngrams' table is missing or an UPDATE fails
  (the failed statement is rolled back first).
  """
  db, cursor = init_db()
  if not is_tbl_exists(db, cursor, 'ngrams'):
    raise Exception('Table "ngrams" does not exist!')
  for stopword in stopwords:
    try:
      # BUG FIX: parameters must be a 1-tuple. '(stopword)' is just the
      # string itself, which DB-API drivers treat as a sequence of
      # per-character parameters.
      cursor.execute("""UPDATE ngrams SET is_valid=0 WHERE name=%s""",
                     (stopword,))
      db.commit()
    except Exception:  # narrowed from bare except (kept KeyboardInterrupt etc. alive)
      db.rollback()
      raise Exception('Error in updating table "ngrams"')
  close_db(db, cursor)
def save_keyphrases_to_table(keyphrases):
  """Insert each keyphrase into the 'ngrams' table if not already there.

  keyphrases: iterable of keyphrase name strings.
  Creates the 'ngrams' table on first use.
  """
  db, cursor = init_db()
  if not is_tbl_exists(db, cursor, 'ngrams'):
    cursor.execute('CREATE TABLE ngrams ' + \
                   '(id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, ' + \
                   'name varchar(255) NOT NULL UNIQUE, ' + \
                   'n INT DEFAULT 0, ' + \
                   'freq INT DEFAULT 0, ' + \
                   'is_valid int(1) DEFAULT 1)')
  for k in keyphrases:
    ngram_id = get_ngram_id_by_name(db, cursor, k)
    # Idiom fix: identity test against None, not '=='.
    if ngram_id is None:
      save_one_keyphrase_to_table(db, cursor, k)
  close_db(db, cursor)
def gen_author_keyphrase(trie):
  """Build the per-author keyphrase frequency table 'personal_keywords'.

  trie: keyphrase trie consumed by gen_term_ctr (opaque here).

  ex: Assuming sean's author cluster is 203, sean is good at nuclear
  (id = 2111) and oxygen (id=76), then:
  author_keyphrase[203] = {2111: 3, 76: 5}, where 3 and 5 are the
  appearance frequency of nuclear and oxygen in his publications.

  Results are flushed to the table every batch_save_size clusters to
  bound memory use.
  """
  batch_save_size = 100
  db, cursor = init_db()

  if not is_tbl_exists(db, cursor, 'personal_keywords'):
    cursor.execute('CREATE TABLE personal_keywords (' + \
                   'id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, ' + \
                   'person_cluster int, ' + \
                   'ngram_id int, ' + \
                   'year int, ' + \
                   'count int, ' + \
                   'log_cite_prod_count float)')

  author_clusters = get_author_clusters()
  author_keyphrase = { }
  sys.stdout.write("Generating author_keyphrase\n")
  n_author_clusters = len(author_clusters)
  for i, author_cluster in enumerate(author_clusters):
    sys.stdout.write("\r%d / %d" % (i+1, n_author_clusters))
    # BUG FIX: the query parameter must be a 1-tuple; '(author_cluster)'
    # is just the bare value, not a tuple.
    cursor.execute("""SELECT authors.cluster, papers.title, """ + \
        """papers.abstract, papers.ncites FROM authors, papers WHERE """ + \
        """authors.paper_cluster = papers.cluster and authors.cluster = %s""",
        (author_cluster,))
    rows = cursor.fetchall()
    for r in rows:
      # Renamed from 'author_cluster' — the original shadowed (and
      # clobbered) the outer loop variable with each row's cluster id.
      person_cluster = r[0]
      # Title and abstract may each be NULL; join them with a ' >>> '
      # separator when the title is present.
      contents = r[1].lower() + ' >>> ' if r[1] is not None else ''
      if r[2] is not None:
        contents += r[2].lower()
      ncites = r[3]
      term_ctr = gen_term_ctr(contents, trie)
      upd_author_keyphrase(author_keyphrase, person_cluster, term_ctr, ncites)
    if (i+1) % batch_save_size == 0:
      save_author_keyphrase_to_table(db, cursor, author_keyphrase)
      author_keyphrase = { }
  # Flush the final partial batch.
  save_author_keyphrase_to_table(db, cursor, author_keyphrase)
  sys.stdout.write("\nCreating indexes...\n")
  # NOTE(review): this ALTER TABLE runs on every invocation and will
  # add duplicate indexes on reruns — consider guarding it.
  cursor.execute('ALTER TABLE personal_keywords ADD INDEX (person_cluster), ' + \
                 'ADD INDEX (ngram_id), ADD INDEX (year)')
  close_db(db, cursor)
def gen_keyphrase_info(trie):
  """Scan all papers and accumulate keyphrase counts and co-occurrences.

  trie: keyphrase trie consumed by the inc_keyphrase_* helpers.

  Creates the 'ngram_relations' table if needed, then walks every
  paper's title + abstract, updating in-memory counters that are
  flushed to the database every flush_every papers (and once more at
  the end).  Finishes by normalizing co-occurrence counts.
  """
  db, cursor = init_db()
  if not is_tbl_exists(db, cursor, 'ngram_relations'):
    cursor.execute(
        'CREATE TABLE ngram_relations '
        '(id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, '
        'src_id INT NOT NULL, '
        'tar_id INT NOT NULL, '
        'co_occur INT DEFAULT 0, '
        'co_occur_norm FLOAT DEFAULT 0, '
        'is_valid INT(1) DEFAULT 1, '
        'UNIQUE src_tar_idx (src_id, tar_id))')

  flush_every = 100
  ctr = defaultdict(int)
  rel_ctr = defaultdict(lambda: defaultdict(int))
  cursor.execute("""SELECT title, abstract FROM papers""")
  papers = cursor.fetchall()
  total = len(papers)
  sys.stdout.write('Generating keyphrase information\n')
  for idx, row in enumerate(papers):
    sys.stdout.write("\r%d / %d" % (idx + 1, total))
    title, abstract = row[0], row[1]
    # Title and abstract may each be NULL; join present parts with ' >>> '.
    contents = title.lower() + ' >>> ' if title is not None else ''
    if abstract is not None:
      contents += abstract.lower()
    inc_keyphrase_ctr(ctr, contents, trie)
    inc_keyphrase_relation_ctr(rel_ctr, contents, trie)
    if (idx + 1) % flush_every == 0:
      upd_keyphrase_ctr_to_table(db, cursor, ctr)
      upd_keyphrase_relation_ctr_to_table(db, cursor, rel_ctr)
      ctr = defaultdict(int)
      rel_ctr = defaultdict(lambda: defaultdict(int))
  # Flush whatever is left from the last partial batch.
  upd_keyphrase_ctr_to_table(db, cursor, ctr)
  upd_keyphrase_relation_ctr_to_table(db, cursor, rel_ctr)
  upd_co_occur_norm()
  sys.stdout.write("\n")
  close_db(db, cursor)
# (scrape artifact — example separator from the source listing, not code)
# Beispiel #6
# 0
def check_required_tables(db, cursor):
  """Verify the tables this module depends on exist; raise if any is missing.

  db, cursor: open database handle pair as returned by init_db().
  Raises Exception naming the first missing table.
  """
  required_tbls = ['authors', 'papers', 'ngrams']
  for tbl in required_tbls:
    if not is_tbl_exists(db, cursor, tbl):
      # BUG FIX: message said 'table' where 'database' was meant.
      raise Exception('Table "' + tbl + '" does not exist in the specified database')