def dow_xml_to_mysql(): xmlschema_doc = etree.parse(FLAGS.schema_file) xmlschema = etree.XMLSchema(xmlschema_doc) db, cursor = init_db() for root, sub_folders, files in os.walk(FLAGS.xml_dir): for file in files: # recursively find xml files if os.path.splitext(file)[1] == '.xml': tree = etree.parse(os.path.join(root, file)) # check schema if not xmlschema.validate(tree): raise Exception("Incorrect schema of xml file " + os.path.join(root, file)) # parse xml for doc in tree.xpath('dbo_RICKMETADATA'): (id, title, authors, abs, url) = ('', '', [], '', '') id = get_element_value(doc, 'ID') title = ' '.join(get_element_value(doc, 'TITL').splitlines()).strip() authors = clean_authors(';'.join(get_element_value(doc, 'AUTH').splitlines()).split(';')) abs = ' '.join(get_element_value(doc, 'ABST').splitlines()).strip() url = get_element_value(doc, 'LINK') # save to mysql if id != '': insert_tables(db, cursor, id, title, authors, abs, url) close_db(db, cursor)
def get_personal_keyword_info(self, person_cluster, ngram_id): db, cursor = init_db() cursor.execute("""SELECT count, log_cite_prod_count FROM personal_keywords """ + \ """WHERE person_cluster = %s AND ngram_id = %s""", (person_cluster, ngram_id)) row = cursor.fetchone() close_db(db, cursor) return row
def main(argv): NUM_PROCESSES = multiprocessing.cpu_count() * 2 cmd_queue = multiprocessing.Queue() job_queue = multiprocessing.Queue() print "Create %d workers..." % NUM_PROCESSES for i in range(NUM_PROCESSES): multiprocessing.Process(target=worker, args=(i, cmd_queue, job_queue)).start() # fetch data from database db, cursor = mysql_util.init_db() try: if not mysql_util.does_table_exist(db, cursor, 'paper_keywords_noun'): cursor.execute('CREATE TABLE paper_keywords_noun (' 'id INT(11) NOT NULL AUTO_INCREMENT PRIMARY KEY, ' 'paper_id varchar(100), ' 'ngram varchar(255), ' 'count int(11), ' 'INDEX (paper_id))') print "Fetching papers from database..." cursor.execute("""SELECT id, title, abstract FROM papers""") papers = cursor.fetchall() dispatch_work(cmd_queue, job_queue, papers) except KeyboardInterrupt as e: pass finally: mysql_util.close_db(db, cursor) print "Main process leaves"
def main(argv): NUM_PROCESSES = multiprocessing.cpu_count() * 2 cmd_queue = multiprocessing.Queue() job_queue = multiprocessing.Queue() print "Create %d workers..." % NUM_PROCESSES for i in range(NUM_PROCESSES): multiprocessing.Process(target=worker, args=(i, cmd_queue, job_queue)).start() # fetch data from database db, cursor = mysql_util.init_db() try: if not mysql_util.does_table_exist(db, cursor, "paper_keywords_noun"): cursor.execute( "CREATE TABLE paper_keywords_noun (" "id INT(11) NOT NULL AUTO_INCREMENT PRIMARY KEY, " "paper_id varchar(100), " "ngram varchar(255), " "count int(11), " "INDEX (paper_id))" ) print "Fetching papers from database..." cursor.execute("""SELECT id, title, abstract FROM papers""") papers = cursor.fetchall() dispatch_work(cmd_queue, job_queue, papers) except KeyboardInterrupt as e: pass finally: mysql_util.close_db(db, cursor) print "Main process leaves"
def get_ngram_co_occur_info(self, min_co_occur=1): db, cursor = init_db() cursor.execute("""SELECT src_id, tar_id FROM ngram_relations """ + \ """WHERE co_occur >= %s""", (min_co_occur)) rows = cursor.fetchall() close_db(db, cursor) return [(r[0], r[1]) for r in rows]
def drop_tables(): print 'Dropping tables' db, cursor = mysql_util.init_db() tables_to_be_removed = mysql_util.get_all_tbl(db, cursor) for tbl in tables_to_be_removed: mysql_util.drop_tbl(db, cursor, tbl) mysql_util.close_db(db, cursor)
def get_experts_from_keywords(self, keyword): db, cursor = init_db() cursor.execute("""SELECT authors.name FROM authors, ngrams, personal_keywords """ + \ """WHERE ngrams.name=%s AND personal_keywords.ngram_id = ngrams.id """ + \ """AND authors.cluster=personal_keywords.person_cluster """ + \ """GROUP BY authors.name""", (keyword)) rows = cursor.fetchall() close_db(db, cursor) return [r[0] for r in rows]
def get_expertise_from_author(self, author): db, cursor = init_db() cursor.execute("""SELECT ngrams.name FROM ngrams, personal_keywords, authors """ + \ """WHERE ngrams.id=personal_keywords.ngram_id AND """ + \ """personal_keywords.person_cluster = authors.cluster AND """ + \ """ngrams.is_valid AND authors.name=%s GROUP BY ngrams.name""", (author)) rows = cursor.fetchall() close_db(db, cursor) return [r[0] for r in rows]
def get_author_clusters(): db, cursor = init_db() author_clusters = [ ] cursor.execute("""SELECT distinct(cluster) FROM authors""") rows = cursor.fetchall() for r in rows: author_clusters.append(r[0]) close_db(db, cursor) return author_clusters
def main(argv): check_args(argv) db, cursor = init_db() try: cursor.execute("""DELETE FROM paper_keywords_noun WHERE ngram=%s""", (FLAGS.drop_keyword)) db.commit() except: sys.stdout.write("""Error in deleting paper_keywords_noun where ngram = %s""" % (FLAGS.drop_keyword)) db.rollback() close_db(db, cursor)
def add_to_ngrams(user_defined_terms): db, cursor = init_db() for term in user_defined_terms: if not is_term_in_ngrams(db, cursor, term): try: cursor.execute("""INSERT INTO ngrams (name, n) VALUES (%s, %s)""", \ (term, len(term.split()))) db.commit() except: sys.stderr.write('Error in inserting table ngrams "' + term + '"\n') db.rollback() close_db(db, cursor)
def save_root_categories_to_db(root_categories): db, cursor = init_db() table_name = 'root_categories' if not is_tbl_exists(db, cursor, table_name): cursor.execute('CREATE TABLE ' + table_name + \ ' (id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, ' + \ ' name VARCHAR(100) NOT NULL)') for root_category in root_categories: if not is_root_category_in_db(db, cursor, root_category[0]): try: cursor.execute("""INSERT INTO root_categories (name) VALUES (%s)""", (root_category[0],)) except Exception, e: print repr(e)
def invalidate_stopwords_from_keyphrases(stopwords): db, cursor = init_db() if not is_tbl_exists(db, cursor, 'ngrams'): raise Exception('Table "ngrams" does not exist!') for stopword in stopwords: try: cursor.execute("""UPDATE ngrams SET is_valid=0 WHERE name=%s""", \ (stopword)) db.commit() except: db.rollback() raise Exception('Error in updating table "ngrams"') close_db(db, cursor)
def save_keyphrases_to_table(keyphrases): db, cursor = init_db() if not is_tbl_exists(db, cursor, 'ngrams'): cursor.execute('CREATE TABLE ngrams ' + \ '(id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, ' + \ 'name varchar(255) NOT NULL UNIQUE, ' + \ 'n INT DEFAULT 0, ' + \ 'freq INT DEFAULT 0, ' + \ 'is_valid int(1) DEFAULT 1)') for k in keyphrases: ngram_id = get_ngram_id_by_name(db, cursor, k) if ngram_id == None: save_one_keyphrase_to_table(db, cursor, k) close_db(db, cursor)
def gen_trie(): db, cursor = init_db() trie = { } cursor.execute("""SELECT id, name FROM ngrams WHERE is_valid = 1""") rows = cursor.fetchall() n_rows = len(rows) sys.stdout.write('Generating trie\n') for i, r in enumerate(rows): sys.stdout.write("\r%d / %d" % (i+1, n_rows)) ngram_id = r[0] name = r[1] insert_trie(trie, ngram_id, name) sys.stdout.write('\n') close_db(db, cursor) return trie
def get_rand_papers(eval_paper_num): ''' Return: list of tuples where each tuple contains (paper_cluster_id, paper_title, and paper_abstract) ''' paper_infos = [ ] db, cursor = init_db() cursor.execute("""SELECT cluster, title, abstract FROM papers WHERE """ + \ """title IS NOT NULL AND title <> '' AND """ + \ """abstract IS NOT NULL AND abstract <> ''""") rows = cursor.fetchall() sampled_idx = random.sample(xrange(len(rows)), min(len(rows), eval_paper_num)) for idx in sampled_idx: paper_infos.append((rows[idx])) close_db(db, cursor) return paper_infos
def create_solr_doc_files(table_name, field_mapping, solr_file_folder, solr_doc_filename_prefix): batchsize = 100000 db, cursor = mysql_util.init_db() print 'Querying database' sql = 'SELECT %s FROM %s' % (','.join(field_mapping.keys()), table_name) cursor.execute(sql) num_files = 0 while True: rows = cursor.fetchmany(batchsize) if not rows: break num_files += 1 sys.stdout.write('Generating doc file number %d\n' % num_files) root = etree.Element('add') for i, r in enumerate(rows): sys.stdout.write("\r%d / %d" % (i+1, len(rows))) doc = etree.Element('doc') for j, col in enumerate(r): (field_name, field_type) = field_mapping.values()[j] ele = etree.Element("field", name=field_name) if field_type in ['string', 'text_general']: assign_xml_node_text(ele, col) if col is not None else '' elif field_type in ['int', 'float', 'long', 'double']: ele.text = str(col) if col is not None else '0' elif field_type in ['date']: ele.text = col.strftime("%Y-%m-%dT%H:%M:%SZ") if col is not None else '' elif field_type in ['boolean']: ele.text = 'true' if col else 'false' else: raise Exception('\nUndefined field type "%s"' % (field_type)) doc.append(ele) root.append(doc) print '' if not os.path.isdir(solr_file_folder): os.mkdir(solr_file_folder) f = open(os.path.join(solr_file_folder, '%s%d.xml' % (solr_doc_filename_prefix, num_files)), 'w') tree = etree.ElementTree(root) tree.write(f, encoding='utf8', pretty_print=True) f.close() mysql_util.close_db(db, cursor)
def upd_co_occur_norm(): db, cursor = init_db() total_freq = float(get_total_freq(db, cursor)) cursor.execute("""SELECT id, src_id, tar_id, co_occur FROM ngram_relations""") rows = cursor.fetchall() for r in rows: id = r[0] src_freq_prob = get_freq_by_ngram_id(db, cursor, r[1]) / total_freq tar_freq_prob = get_freq_by_ngram_id(db, cursor, r[2]) / total_freq co_occur_prob = r[3] / total_freq co_occur_norm = normalize(pmi(src_freq_prob, tar_freq_prob, co_occur_prob) / (-math.log(co_occur_prob)), -1, 1, 1, 5) try: cursor.execute("""UPDATE ngram_relations SET co_occur_norm = %s WHERE id = %s""", (co_occur_norm, id)) db.commit() except: sys.stderr.write('Error in updating table ngrams of id "' + str(id) + '"\n') db.rollback() close_db(db, cursor)
def gen_author_keyphrase(trie): # ex: Assuming sean's author cluster is 203, sean is good at nuclear (id = 2111) and oxygen (id=76), # then: author_keyphrase[203] = {2111: 3, 76: 5}, where 3 and 5 are the appearance frequency # of nuclear and oxygen in his publications batch_save_size = 100 db, cursor = init_db() if not is_tbl_exists(db, cursor, 'personal_keywords'): cursor.execute('CREATE TABLE personal_keywords (' + \ 'id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, ' + \ 'person_cluster int, ' + \ 'ngram_id int, ' + \ 'year int, ' + \ 'count int, ' + \ 'log_cite_prod_count float)') author_clusters = get_author_clusters() author_keyphrase = { } sys.stdout.write("Generating author_keyphrase\n") n_author_clusters = len(author_clusters) for i, author_cluster in enumerate(author_clusters): sys.stdout.write("\r%d / %d" % (i+1, n_author_clusters)) cursor.execute("""SELECT authors.cluster, papers.title, """ + \ """papers.abstract, papers.ncites FROM authors, papers WHERE """ + \ """authors.paper_cluster = papers.cluster and authors.cluster = %s""", (author_cluster)) rows = cursor.fetchall() for r in rows: author_cluster = r[0] contents = r[1].lower() + ' >>> ' if r[1] is not None else '' if r[2] is not None: contents += r[2].lower() ncites = r[3] term_ctr = gen_term_ctr(contents, trie) upd_author_keyphrase(author_keyphrase, author_cluster, term_ctr, ncites) if (i+1) % batch_save_size == 0: save_author_keyphrase_to_table(db, cursor, author_keyphrase) author_keyphrase = { } save_author_keyphrase_to_table(db, cursor, author_keyphrase) sys.stdout.write("\nCreating indexes...\n") cursor.execute('ALTER TABLE personal_keywords ADD INDEX (person_cluster), ' + \ 'ADD INDEX (ngram_id), ADD INDEX (year)') close_db(db, cursor)
def create_solr_doc_files(): batchsize = 100000 solr_file_foler = "./settings/solr_files" db, cursor = init_db() print 'Querying paper contents' cursor.execute("""SELECT cluster, title, abstract, ncites FROM papers""") num_files = 0 while True: rows = cursor.fetchmany(batchsize) if not rows: break num_files += 1 print 'Generating doc file number ' + str(num_files) root = etree.Element('add') num_docs_to_gen = len(rows) for i, r in enumerate(rows): sys.stdout.write("\r%d / %d, id: %s" % (i+1, num_docs_to_gen, r[0])) doc = etree.Element('doc') id = etree.Element("field", name="id") id.text = str(r[0]) title = etree.Element("field", name="title") assign_xml_node_text(title, r[1]) abs = etree.Element("field", name="abstract") assign_xml_node_text(abs, r[2]) ncites = etree.Element("field", name="ncites") ncites.text = str(r[3]) if r[3] is not None else '0' doc.append(id) doc.append(title) doc.append(abs) doc.append(ncites) root.append(doc) print '' f = open(os.path.join(solr_file_foler, 'papers' + str(num_files) + '.xml'), 'w') tree = etree.ElementTree(root) tree.write(f, encoding='utf8', pretty_print=True) f.close() close_db(db, cursor)
def worker(worker_id, cmd_queue, job_queue): db, cursor = mysql_util.init_db() try: while True: # get data to process cmd_queue.put(("GET", None)) paper_info = job_queue.get() if paper_info == None: break # process data pid, term_ctr = calc_term_ctr(paper_info) save_to_tbl(db, cursor, pid, term_ctr) # report the the dispatcher that we have done one cmd_queue.put(("DONE", pid)) except KeyboardInterrupt as e: pass finally: mysql_util.close_db(db, cursor) print print "Worker %d leaves" % worker_id
def worker(worker_id, cmd_queue, job_queue): db, cursor = mysql_util.init_db() try: while True: # get data to process cmd_queue.put(('GET', None)) paper_info = job_queue.get() if paper_info == None: break # process data pid, term_ctr = calc_term_ctr(paper_info) save_to_tbl(db, cursor, pid, term_ctr) # report the the dispatcher that we have done one cmd_queue.put(('DONE', pid)) except KeyboardInterrupt as e: pass finally: mysql_util.close_db(db, cursor) print print 'Worker %d leaves' % worker_id
def gen_keyphrase_info(trie): db, cursor = init_db() if not is_tbl_exists(db, cursor, 'ngram_relations'): cursor.execute('CREATE TABLE ngram_relations ' + \ '(id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, ' + \ 'src_id INT NOT NULL, ' + \ 'tar_id INT NOT NULL, ' + \ 'co_occur INT DEFAULT 0, ' + \ 'co_occur_norm FLOAT DEFAULT 0, ' + \ 'is_valid INT(1) DEFAULT 1, ' + \ 'UNIQUE src_tar_idx (src_id, tar_id))') batch_save_size = 100 keyphrase_ctr = defaultdict(int) keyphrase_relation_ctr = defaultdict(lambda : defaultdict(int)) cursor.execute("""SELECT title, abstract FROM papers""") rows = cursor.fetchall() num_rows = len(rows) sys.stdout.write('Generating keyphrase information\n') for i, r in enumerate(rows): sys.stdout.write("\r%d / %d" % (i+1, num_rows)) contents = r[0].lower() + ' >>> ' if r[0] is not None else '' if r[1] is not None: contents += r[1].lower() inc_keyphrase_ctr(keyphrase_ctr, contents, trie) inc_keyphrase_relation_ctr(keyphrase_relation_ctr, contents, trie) if (i+1) % batch_save_size == 0: upd_keyphrase_ctr_to_table(db, cursor, keyphrase_ctr) upd_keyphrase_relation_ctr_to_table(db, cursor, keyphrase_relation_ctr) keyphrase_ctr = defaultdict(int) keyphrase_relation_ctr = defaultdict(lambda : defaultdict(int)) upd_keyphrase_ctr_to_table(db, cursor, keyphrase_ctr) upd_keyphrase_relation_ctr_to_table(db, cursor, keyphrase_relation_ctr) upd_co_occur_norm() sys.stdout.write("\n") #cursor.execute('ALTER TABLE ngram_relations ADD UNIQUE INDEX (src_id, tar_id)') close_db(db, cursor)
def get_ngram_info(self, term): db, cursor = init_db() cursor.execute("""SELECT name, freq FROM ngrams WHERE name=%s""", (term)) row = cursor.fetchone() close_db(db, cursor) return row
def reset_tables(): db, cursor = init_db() check_required_tables(db, cursor) reset_tables_for_testing(db, cursor) close_db(db, cursor)
def test_is_db_exists(self): db, cursor = init_db() assert_equal(is_db_exists(db, cursor, 'this-is-a-not-existing-database'), False) assert_equal(is_db_exists(db, cursor, 'information_schema'), True) close_db(db, cursor)
doi2keyphrases = {} with open('keyphrases') as csvfile: csvreader = csv.reader(csvfile, delimiter='\t') for row in csvreader: id = row[0] keyphrases = row[1:] if id not in hash2doi: continue doi = hash2doi[id] doi2keyphrases[doi] = keyphrases db, cursor = mysql_util.init_db() total = len(doi2keyphrases) try: if not mysql_util.does_table_exist(db, cursor, 'paper_keywords_allenai'): cursor.execute('CREATE TABLE paper_keywords_allenai (' 'id INT(11) NOT NULL AUTO_INCREMENT PRIMARY KEY, ' 'doi varchar(100), ' 'keyphrase varchar(255), ' 'INDEX (doi))') count = 1 for doi, keyphrases in doi2keyphrases.iteritems(): print "\r%d / %d: %s: %s" % (count, total, doi, str(keyphrases)), sys.stdout.flush()