def dow_xml_to_mysql():
  xmlschema_doc = etree.parse(FLAGS.schema_file)
  xmlschema = etree.XMLSchema(xmlschema_doc)

  db, cursor = init_db()
  for root, sub_folders, files in os.walk(FLAGS.xml_dir):
    for file in files:
      # recursively find xml files
      if os.path.splitext(file)[1] == '.xml':
        tree = etree.parse(os.path.join(root, file))

        # check schema
        if not xmlschema.validate(tree):
          raise Exception("Incorrect schema of xml file " + os.path.join(root, file))

        # parse xml
        for doc in tree.xpath('dbo_RICKMETADATA'):
          (id, title, authors, abs, url) = ('', '', [], '', '')
          id = get_element_value(doc, 'ID')
          title = ' '.join(get_element_value(doc, 'TITL').splitlines()).strip()
          authors = clean_authors(';'.join(get_element_value(doc, 'AUTH').splitlines()).split(';'))
          abs = ' '.join(get_element_value(doc, 'ABST').splitlines()).strip()
          url = get_element_value(doc, 'LINK')

          # save to mysql
          if id != '':
            insert_tables(db, cursor, id, title, authors, abs, url)

  close_db(db, cursor)
Beispiel #2
0
 def get_personal_keyword_info(self, person_cluster, ngram_id):
   db, cursor = init_db()
   cursor.execute("""SELECT count, log_cite_prod_count FROM personal_keywords """ + \
                  """WHERE person_cluster = %s AND ngram_id = %s""", (person_cluster, ngram_id))
   row = cursor.fetchone()
   close_db(db, cursor)
   return row
Beispiel #3
0
def main(argv):
    NUM_PROCESSES = multiprocessing.cpu_count() * 2

    cmd_queue = multiprocessing.Queue()
    job_queue = multiprocessing.Queue()

    print "Create %d workers..." % NUM_PROCESSES
    for i in range(NUM_PROCESSES):
        multiprocessing.Process(target=worker,
                                args=(i, cmd_queue, job_queue)).start()

    # fetch data from database
    db, cursor = mysql_util.init_db()
    try:
        if not mysql_util.does_table_exist(db, cursor, 'paper_keywords_noun'):
            cursor.execute('CREATE TABLE paper_keywords_noun ('
                           'id INT(11) NOT NULL AUTO_INCREMENT PRIMARY KEY, '
                           'paper_id varchar(100), '
                           'ngram varchar(255), '
                           'count int(11), '
                           'INDEX (paper_id))')

        print "Fetching papers from database..."
        cursor.execute("""SELECT id, title, abstract FROM papers""")
        papers = cursor.fetchall()

        dispatch_work(cmd_queue, job_queue, papers)
    except KeyboardInterrupt as e:
        pass
    finally:
        mysql_util.close_db(db, cursor)
        print "Main process leaves"
def main(argv):
    NUM_PROCESSES = multiprocessing.cpu_count() * 2

    cmd_queue = multiprocessing.Queue()
    job_queue = multiprocessing.Queue()

    print "Create %d workers..." % NUM_PROCESSES
    for i in range(NUM_PROCESSES):
        multiprocessing.Process(target=worker, args=(i, cmd_queue, job_queue)).start()

    # fetch data from database
    db, cursor = mysql_util.init_db()
    try:
        if not mysql_util.does_table_exist(db, cursor, "paper_keywords_noun"):
            cursor.execute(
                "CREATE TABLE paper_keywords_noun ("
                "id INT(11) NOT NULL AUTO_INCREMENT PRIMARY KEY, "
                "paper_id varchar(100), "
                "ngram varchar(255), "
                "count int(11), "
                "INDEX (paper_id))"
            )

        print "Fetching papers from database..."
        cursor.execute("""SELECT id, title, abstract FROM papers""")
        papers = cursor.fetchall()

        dispatch_work(cmd_queue, job_queue, papers)
    except KeyboardInterrupt as e:
        pass
    finally:
        mysql_util.close_db(db, cursor)
        print "Main process leaves"
Beispiel #5
0
 def get_ngram_co_occur_info(self, min_co_occur=1):
   db, cursor = init_db()
   cursor.execute("""SELECT src_id, tar_id FROM ngram_relations """ + \
                  """WHERE co_occur >= %s""", (min_co_occur))
   rows = cursor.fetchall()
   close_db(db, cursor)
   return [(r[0], r[1]) for r in rows]
Beispiel #6
0
def drop_tables():
  print 'Dropping tables'
  db, cursor = mysql_util.init_db()
  tables_to_be_removed = mysql_util.get_all_tbl(db, cursor)
  for tbl in tables_to_be_removed:
    mysql_util.drop_tbl(db, cursor, tbl)
  mysql_util.close_db(db, cursor)
Beispiel #7
0
 def get_experts_from_keywords(self, keyword):
   db, cursor = init_db()
   cursor.execute("""SELECT authors.name FROM authors, ngrams, personal_keywords """ + \
                  """WHERE ngrams.name=%s AND personal_keywords.ngram_id = ngrams.id """ + \
                  """AND authors.cluster=personal_keywords.person_cluster """ + \
                  """GROUP BY authors.name""", (keyword))
   rows = cursor.fetchall()
   close_db(db, cursor)
   return [r[0] for r in rows]
Beispiel #8
0
  def get_expertise_from_author(self, author):
    db, cursor = init_db()
    cursor.execute("""SELECT ngrams.name FROM ngrams, personal_keywords, authors """ + \
                   """WHERE ngrams.id=personal_keywords.ngram_id AND """ + \
                   """personal_keywords.person_cluster = authors.cluster AND """ + \
                   """ngrams.is_valid AND authors.name=%s GROUP BY ngrams.name""", (author))
    rows = cursor.fetchall()
    close_db(db, cursor)

    return [r[0] for r in rows]
def get_author_clusters():
  db, cursor = init_db()

  author_clusters = [ ]
  cursor.execute("""SELECT distinct(cluster) FROM authors""")
  rows = cursor.fetchall()
  for r in rows:
    author_clusters.append(r[0])

  close_db(db, cursor)
  return author_clusters
Beispiel #10
0
def main(argv):
  check_args(argv)

  db, cursor = init_db()
  try:
    cursor.execute("""DELETE FROM paper_keywords_noun WHERE ngram=%s""", (FLAGS.drop_keyword))
    db.commit()
  except:
    sys.stdout.write("""Error in deleting paper_keywords_noun where ngram = %s""" % (FLAGS.drop_keyword))
    db.rollback()

  close_db(db, cursor)
def add_to_ngrams(user_defined_terms):
  db, cursor = init_db()
  for term in user_defined_terms:
    if not is_term_in_ngrams(db, cursor, term):
      try:
        cursor.execute("""INSERT INTO ngrams (name, n) VALUES (%s, %s)""", \
                       (term, len(term.split())))
        db.commit()
      except:
        sys.stderr.write('Error in inserting table ngrams "' + term + '"\n')
        db.rollback()
  close_db(db, cursor)
def save_root_categories_to_db(root_categories):
  db, cursor = init_db()
  table_name = 'root_categories'
  if not is_tbl_exists(db, cursor, table_name):
    cursor.execute('CREATE TABLE ' + table_name + \
        ' (id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, ' + \
        ' name VARCHAR(100) NOT NULL)')
  for root_category in root_categories:
    if not is_root_category_in_db(db, cursor, root_category[0]):
      try:
        cursor.execute("""INSERT INTO root_categories (name) VALUES (%s)""", (root_category[0],))
      except Exception, e:
        print repr(e)
def invalidate_stopwords_from_keyphrases(stopwords):
  db, cursor = init_db()
  if not is_tbl_exists(db, cursor, 'ngrams'):
    raise Exception('Table "ngrams" does not exist!')
  for stopword in stopwords:
    try:
      cursor.execute("""UPDATE ngrams SET is_valid=0 WHERE name=%s""", \
          (stopword))
      db.commit()
    except:
      db.rollback()
      raise Exception('Error in updating table "ngrams"')
  close_db(db, cursor)
def save_keyphrases_to_table(keyphrases):
  db, cursor = init_db()
  if not is_tbl_exists(db, cursor, 'ngrams'):
    cursor.execute('CREATE TABLE ngrams ' + \
                   '(id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, ' + \
                   'name varchar(255) NOT NULL UNIQUE, ' + \
                   'n INT DEFAULT 0, ' + \
                   'freq INT DEFAULT 0, ' + \
                   'is_valid int(1) DEFAULT 1)')
  for k in keyphrases:
    ngram_id = get_ngram_id_by_name(db, cursor, k)
    if ngram_id == None:
      save_one_keyphrase_to_table(db, cursor, k)
  close_db(db, cursor)
def gen_trie():
  db, cursor = init_db()

  trie = { }
  cursor.execute("""SELECT id, name FROM ngrams WHERE is_valid = 1""")
  rows = cursor.fetchall()
  n_rows = len(rows)
  sys.stdout.write('Generating trie\n')
  for i, r in enumerate(rows):
    sys.stdout.write("\r%d / %d" % (i+1, n_rows))
    ngram_id = r[0]
    name = r[1]
    insert_trie(trie, ngram_id, name)
  sys.stdout.write('\n')

  close_db(db, cursor)
  return trie
def get_rand_papers(eval_paper_num):
  '''
  Return: list of tuples where each tuple contains
  (paper_cluster_id, paper_title, and paper_abstract)
  '''
  paper_infos = [ ]
  db, cursor = init_db()
  cursor.execute("""SELECT cluster, title, abstract FROM papers WHERE """ + \
                 """title IS NOT NULL AND title <> '' AND """ + \
                 """abstract IS NOT NULL AND abstract <> ''""")
  rows = cursor.fetchall()
  sampled_idx = random.sample(xrange(len(rows)), min(len(rows), eval_paper_num))
  for idx in sampled_idx:
    paper_infos.append((rows[idx]))

  close_db(db, cursor)
  return paper_infos
Beispiel #17
0
def create_solr_doc_files(table_name, field_mapping, solr_file_folder, solr_doc_filename_prefix):
    batchsize = 100000

    db, cursor = mysql_util.init_db()
    print 'Querying database'
    sql = 'SELECT %s FROM %s' % (','.join(field_mapping.keys()), table_name)

    cursor.execute(sql)
    num_files = 0
    while True:
        rows = cursor.fetchmany(batchsize)
        if not rows:
            break
        num_files += 1
        sys.stdout.write('Generating doc file number %d\n' % num_files)
        root = etree.Element('add')
        
        for i, r in enumerate(rows):
            sys.stdout.write("\r%d / %d" % (i+1, len(rows)))
            doc = etree.Element('doc')
            for j, col in enumerate(r):
                (field_name, field_type) = field_mapping.values()[j]
                ele = etree.Element("field", name=field_name)
                if field_type in ['string', 'text_general']:
                    assign_xml_node_text(ele, col) if col is not None else ''
                elif field_type in ['int', 'float', 'long', 'double']:
                    ele.text = str(col) if col is not None else '0'
                elif field_type in ['date']:
                    ele.text = col.strftime("%Y-%m-%dT%H:%M:%SZ") if col is not None else ''
                elif field_type in ['boolean']:
                    ele.text = 'true' if col else 'false'
                else:
                    raise Exception('\nUndefined field type "%s"' % (field_type))
                doc.append(ele)
            root.append(doc)
        print ''
        if not os.path.isdir(solr_file_folder):
            os.mkdir(solr_file_folder)
        f = open(os.path.join(solr_file_folder, '%s%d.xml' % (solr_doc_filename_prefix, num_files)), 'w')
        tree = etree.ElementTree(root)
        tree.write(f, encoding='utf8', pretty_print=True)
        f.close()

    mysql_util.close_db(db, cursor)
def upd_co_occur_norm():
  db, cursor = init_db()
  total_freq = float(get_total_freq(db, cursor))
  cursor.execute("""SELECT id, src_id, tar_id, co_occur FROM ngram_relations""")
  rows = cursor.fetchall()
  for r in rows:
    id = r[0]
    src_freq_prob = get_freq_by_ngram_id(db, cursor, r[1]) / total_freq
    tar_freq_prob = get_freq_by_ngram_id(db, cursor, r[2]) / total_freq
    co_occur_prob = r[3] / total_freq
    co_occur_norm = normalize(pmi(src_freq_prob, tar_freq_prob, co_occur_prob) / (-math.log(co_occur_prob)), -1, 1, 1, 5)
    try:
      cursor.execute("""UPDATE ngram_relations SET co_occur_norm = %s WHERE id = %s""",
          (co_occur_norm, id))
      db.commit()
    except:
      sys.stderr.write('Error in updating table ngrams of id "' + str(id) + '"\n')
      db.rollback()
  close_db(db, cursor)
def gen_author_keyphrase(trie):
  # ex: Assuming sean's author cluster is 203, sean is good at nuclear (id = 2111) and oxygen (id=76),
  #     then: author_keyphrase[203] = {2111: 3, 76: 5}, where 3 and 5 are the appearance frequency
  #     of nuclear and oxygen in his publications
  batch_save_size = 100
  db, cursor = init_db()

  if not is_tbl_exists(db, cursor, 'personal_keywords'):
    cursor.execute('CREATE TABLE personal_keywords (' + \
                   'id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, ' + \
                   'person_cluster int, ' + \
                   'ngram_id int, ' + \
                   'year int, ' + \
                   'count int, ' + \
                   'log_cite_prod_count float)')

  author_clusters = get_author_clusters()
  author_keyphrase = { }
  sys.stdout.write("Generating author_keyphrase\n")
  n_author_clusters = len(author_clusters)
  for i, author_cluster in enumerate(author_clusters):
    sys.stdout.write("\r%d / %d" % (i+1, n_author_clusters))
    cursor.execute("""SELECT authors.cluster, papers.title, """ + \
        """papers.abstract, papers.ncites FROM authors, papers WHERE """ + \
        """authors.paper_cluster = papers.cluster and authors.cluster = %s""", (author_cluster))
    rows = cursor.fetchall()
    for r in rows:
      author_cluster = r[0]
      contents = r[1].lower() + ' >>> ' if r[1] is not None else ''
      if r[2] is not None:
        contents += r[2].lower()
      ncites = r[3]
      term_ctr = gen_term_ctr(contents, trie)
      upd_author_keyphrase(author_keyphrase, author_cluster, term_ctr, ncites)
    if (i+1) % batch_save_size == 0:
      save_author_keyphrase_to_table(db, cursor, author_keyphrase)
      author_keyphrase = { }
  save_author_keyphrase_to_table(db, cursor, author_keyphrase)
  sys.stdout.write("\nCreating indexes...\n")
  cursor.execute('ALTER TABLE personal_keywords ADD INDEX (person_cluster), ' + \
                 'ADD INDEX (ngram_id), ADD INDEX (year)')
  close_db(db, cursor)
def create_solr_doc_files():
  batchsize = 100000
  solr_file_foler = "./settings/solr_files"
  db, cursor = init_db()
  print 'Querying paper contents'
  cursor.execute("""SELECT cluster, title, abstract, ncites FROM papers""")

  num_files = 0
  while True:
    rows = cursor.fetchmany(batchsize)
    if not rows:
      break
    num_files += 1
    print 'Generating doc file number ' + str(num_files)
    root = etree.Element('add')
    num_docs_to_gen = len(rows)
    for i, r in enumerate(rows):
      sys.stdout.write("\r%d / %d, id: %s" % (i+1, num_docs_to_gen, r[0]))
      doc = etree.Element('doc')
      id = etree.Element("field", name="id")
      id.text = str(r[0])
      title = etree.Element("field", name="title")
      assign_xml_node_text(title, r[1])
      abs = etree.Element("field", name="abstract")
      assign_xml_node_text(abs, r[2])
      ncites = etree.Element("field", name="ncites")
      ncites.text = str(r[3]) if r[3] is not None else '0'
      doc.append(id)
      doc.append(title)
      doc.append(abs)
      doc.append(ncites)
      root.append(doc)
    print ''
    f = open(os.path.join(solr_file_foler, 'papers' + str(num_files) + '.xml'), 'w')
    tree = etree.ElementTree(root)
    tree.write(f, encoding='utf8', pretty_print=True)
    f.close()
  close_db(db, cursor)
def worker(worker_id, cmd_queue, job_queue):
    db, cursor = mysql_util.init_db()

    try:
        while True:
            # get data to process
            cmd_queue.put(("GET", None))
            paper_info = job_queue.get()
            if paper_info == None:
                break

            # process data
            pid, term_ctr = calc_term_ctr(paper_info)
            save_to_tbl(db, cursor, pid, term_ctr)

            # report the the dispatcher that we have done one
            cmd_queue.put(("DONE", pid))
    except KeyboardInterrupt as e:
        pass
    finally:
        mysql_util.close_db(db, cursor)
        print
        print "Worker %d leaves" % worker_id
Beispiel #22
0
def worker(worker_id, cmd_queue, job_queue):
    db, cursor = mysql_util.init_db()

    try:
        while True:
            # get data to process
            cmd_queue.put(('GET', None))
            paper_info = job_queue.get()
            if paper_info == None:
                break

            # process data
            pid, term_ctr = calc_term_ctr(paper_info)
            save_to_tbl(db, cursor, pid, term_ctr)

            # report the the dispatcher that we have done one
            cmd_queue.put(('DONE', pid))
    except KeyboardInterrupt as e:
        pass
    finally:
        mysql_util.close_db(db, cursor)
        print
        print 'Worker %d leaves' % worker_id
def gen_keyphrase_info(trie):
  db, cursor = init_db()
  if not is_tbl_exists(db, cursor, 'ngram_relations'):
    cursor.execute('CREATE TABLE ngram_relations '  + \
        '(id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, ' + \
        'src_id INT NOT NULL, ' + \
        'tar_id INT NOT NULL, ' + \
        'co_occur INT DEFAULT 0, ' + \
        'co_occur_norm FLOAT DEFAULT 0, ' + \
        'is_valid INT(1) DEFAULT 1, ' + \
        'UNIQUE src_tar_idx (src_id, tar_id))')

  batch_save_size = 100
  keyphrase_ctr = defaultdict(int)
  keyphrase_relation_ctr = defaultdict(lambda : defaultdict(int))
  cursor.execute("""SELECT title, abstract FROM papers""")
  rows = cursor.fetchall()
  num_rows = len(rows)
  sys.stdout.write('Generating keyphrase information\n')
  for i, r in enumerate(rows):
    sys.stdout.write("\r%d / %d" % (i+1, num_rows))
    contents = r[0].lower() + ' >>> ' if r[0] is not None else ''
    if r[1] is not None:
      contents += r[1].lower()
    inc_keyphrase_ctr(keyphrase_ctr, contents, trie)
    inc_keyphrase_relation_ctr(keyphrase_relation_ctr, contents, trie)
    if (i+1) % batch_save_size == 0:
      upd_keyphrase_ctr_to_table(db, cursor, keyphrase_ctr)
      upd_keyphrase_relation_ctr_to_table(db, cursor, keyphrase_relation_ctr)
      keyphrase_ctr = defaultdict(int)
      keyphrase_relation_ctr = defaultdict(lambda : defaultdict(int))
  upd_keyphrase_ctr_to_table(db, cursor, keyphrase_ctr)
  upd_keyphrase_relation_ctr_to_table(db, cursor, keyphrase_relation_ctr)
  upd_co_occur_norm()
  sys.stdout.write("\n")
  #cursor.execute('ALTER TABLE ngram_relations ADD UNIQUE INDEX (src_id, tar_id)')
  close_db(db, cursor)
Beispiel #24
0
 def get_ngram_info(self, term):
   db, cursor = init_db()
   cursor.execute("""SELECT name, freq FROM ngrams WHERE name=%s""", (term))
   row = cursor.fetchone()
   close_db(db, cursor)
   return row
Beispiel #25
0
def reset_tables():
  db, cursor = init_db()
  check_required_tables(db, cursor)
  reset_tables_for_testing(db, cursor)
  close_db(db, cursor)
 def test_is_db_exists(self):
   db, cursor = init_db()
   assert_equal(is_db_exists(db, cursor, 'this-is-a-not-existing-database'), False)
   assert_equal(is_db_exists(db, cursor, 'information_schema'), True)
   close_db(db, cursor)
doi2keyphrases = {}

with open('keyphrases') as csvfile:
    csvreader = csv.reader(csvfile, delimiter='\t')

    for row in csvreader:
        id = row[0]
        keyphrases = row[1:]

        if id not in hash2doi:
            continue
        doi = hash2doi[id]
        doi2keyphrases[doi] = keyphrases

db, cursor = mysql_util.init_db()

total = len(doi2keyphrases)
try:
    if not mysql_util.does_table_exist(db, cursor, 'paper_keywords_allenai'):
        cursor.execute('CREATE TABLE paper_keywords_allenai ('
            'id INT(11) NOT NULL AUTO_INCREMENT PRIMARY KEY, '
            'doi varchar(100), '
            'keyphrase varchar(255), '
            'INDEX (doi))')

    count = 1
    for doi, keyphrases in doi2keyphrases.iteritems():
        print "\r%d / %d: %s: %s" % (count, total, doi, str(keyphrases)),
        sys.stdout.flush()