# Populate paper.hashvalue for every row of the `paper` table.
#
# Each paper's title is lower-cased and passed to paperlens_import.intHash();
# the resulting value is written back to the row's `hashvalue` column.
# All hashes are collected in memory first and written in a second pass,
# because the same cursor is used for both the SELECT and the UPDATEs.
# The updates are committed once, after the last UPDATE succeeded; if a
# MySQLdb.Error occurs, its arguments are printed and nothing is committed.
#
# NOTE(review): the database credentials are hard-coded below; consider
# loading them from configuration or the environment instead.

# A progress line (the running row count) is printed every this many rows.
_PROGRESS_EVERY = 10000

connection = MySQLdb.connect(host="127.0.0.1", user="******", passwd="paper1ens", db="paperlens")
cursor = connection.cursor()
try:
    # Pass 1: read every (id, title) pair and compute the title hash.
    paper_hash = dict()
    cursor.execute("select id,title from paper")
    # iter(callable, sentinel) keeps calling fetchone() until it returns None.
    for n, row in enumerate(iter(cursor.fetchone, None), 1):
        paper_id = int(row[0])
        title = row[1]
        paper_hash[paper_id] = paperlens_import.intHash(title.lower())
        if n % _PROGRESS_EVERY == 0:
            print(n)

    # Pass 2: store each computed hash on its paper row.
    for n, (paper_id, hash_value) in enumerate(paper_hash.items(), 1):
        cursor.execute("update paper set hashvalue=%s where id=%s", (hash_value, paper_id))
        if n % _PROGRESS_EVERY == 0:
            print(n)

    connection.commit()
except MySQLdb.Error as e:
    # Print all error arguments separated by spaces (typically "<code> <message>");
    # joining avoids an IndexError when the error carries fewer than two args.
    print(" ".join(str(arg) for arg in e.args))
finally:
    # Release the cursor and the connection on every path, not only on success.
    try:
        cursor.close()
    finally:
        connection.close()
connection.commit() try: data = open("../../../data/citeseer.txt") cursor.execute("truncate table paper_citeseer") cursor.execute("truncate table cite_citeseer") citeseer_id_map = dict() title = '' citeseer_id = '' n = 0 for line in data: (key, value) = Extract(line) if line.find("<record>") >= 0: if len(title) > 20: hashvalue = paperlens_import.intHash(title.lower()) cursor.execute("select count(*),id from paper where hashvalue=%s",(hashvalue)) row = cursor.fetchone() if int(row[0]) == 1: paper_id = int(row[1]) if citeseer_id not in citeseer_id_map: citeseer_id_map[citeseer_id] = paper_id #cursor.execute("replace into paper_citeseer (paper_id, citeseer_key) values (%s, %s)",(paper_id, citeseer_id)) if n % 10000 == 0: print n, title, citeseer_id n = n + 1 title = '' citeseer_id = '' if key == "<dc:title>":
passwd="paper1ens", db="paperlens") cursor = connection.cursor() connection.commit() try: data = open("../../../data/citeseer.txt") cursor.execute("truncate table paper_citeseer") title = '' citeseer_id = '' n = 0 for line in data: (key, value) = Extract(line) if line.find("<record>") >= 0: if len(title) > 20: hashvalue = paperlens_import.intHash(title.lower()) cursor.execute( "select count(*),id from paper where hashvalue=%s", (hashvalue)) row = cursor.fetchone() if int(row[0]) == 1: paper_id = int(row[1]) cursor.execute( "replace into paper_citeseer (paper_id, citeseer_key) values (%s, %s)", (paper_id, citeseer_id)) if n % 10000 == 0: print n, title, citeseer_id n = n + 1 title = ''