def fill_doc_table(snippets_loc, docs_loc):
    """Insert one fedtask_document row per snippet found in the snippet files.

    For every (site, topic) pair read from the fedtask_site and fedtask_topic
    tables, parses ``<snippets_loc>/<site_id>/<topic_id>.xml`` and inserts a
    row holding each snippet's id, site id, title, link text, the link's
    ``cache`` attribute and description.

    Args:
        snippets_loc: Directory containing one sub-directory of snippet XML
            files per site id.
        docs_loc: Not used by this function; kept so existing callers keep
            working.
    """
    print('Process snippets and documents')

    # Get all sites
    qry = 'select distinct site_id from fedtask_site'
    sites = db.run_qry_with_results(qry, conn)

    # Get all topics
    qry = 'select distinct topic_id from fedtask_topic'
    topics = db.run_qry_with_results(qry, conn)

    # Get snippets of (site, topic)
    for s in sites:
        site_id = s[0]
        for t in topics:
            snippet_file = '%s/%s/%s.xml' % (snippets_loc, site_id, t[0])
            root = et.parse(snippet_file).getroot()

            # The snippets are the children of the third child of
            # <search_results>.
            snippets = root.find('search_results')[2]

            # Get info of each doc: doc_id, title, summary, url
            for sn in snippets:
                docid = sn.get('id')
                link = sn.find('link')
                title = sn.find('title')
                summary = sn.find('description')

                # Page link and HTML location. A snippet without a <link>
                # element is stored with empty values instead of raising
                # AttributeError, matching how a missing title or
                # description is handled below.
                sn_url = ''
                doc_loc = ''
                if link is not None:
                    if link.text is not None:
                        sn_url = link.text.replace('"', '\\"')
                    cache = link.get('cache')
                    if cache is not None:
                        doc_loc = cache

                # Title
                sn_title = ''
                if title is not None and title.text is not None:
                    sn_title = title.text.replace('"', '\\"').encode('utf-8')

                # Summary: backslashes are dropped before the quotes are
                # escaped, so only the escaping backslashes remain.
                sn_summary = ''
                if summary is not None and summary.text is not None:
                    cleaned = summary.text.replace('\\', '')
                    sn_summary = cleaned.replace('"', '\\"').encode('utf-8')

                # NOTE(review): values are interpolated straight into the SQL
                # text with only double quotes escaped. If db.run_qry can take
                # bound parameters, prefer that -- cannot tell from here.
                # NOTE(review): title and summary are UTF-8 encoded but the
                # URL is not (unchanged from the original) -- confirm that
                # db.run_qry copes with the mix.
                qry = (
                    'insert into fedtask_document '
                    '(doc_id, site_id, title, url, html_location, summary) '
                    'values ("%s", "%s", "%s", "%s", "%s", "%s")'
                ) % (docid, site_id, sn_title, sn_url, doc_loc, sn_summary)
                db.run_qry(qry, conn)
def fill_doc_table(snippets_loc, docs_loc):
    """Insert one fedtask_document row per snippet found in the snippet files.

    For every (site, topic) pair read from the fedtask_site and fedtask_topic
    tables, parses ``<snippets_loc>/<site_id>/<topic_id>.xml`` and inserts a
    row holding each snippet's id, site id, title, link text, the link's
    ``cache`` attribute and description.

    Args:
        snippets_loc: Directory containing one sub-directory of snippet XML
            files per site id.
        docs_loc: Not used by this function; kept so existing callers keep
            working.
    """
    print('Process snippets and documents')

    # Get all sites
    qry = 'select distinct site_id from fedtask_site'
    sites = db.run_qry_with_results(qry, conn)

    # Get all topics
    qry = 'select distinct topic_id from fedtask_topic'
    topics = db.run_qry_with_results(qry, conn)

    # Get snippets of (site, topic)
    for s in sites:
        site_id = s[0]
        for t in topics:
            snippet_file = '%s/%s/%s.xml' % (snippets_loc, site_id, t[0])
            root = et.parse(snippet_file).getroot()

            # The snippets are the children of the third child of
            # <search_results>.
            snippets = root.find('search_results')[2]

            # Get info of each doc: doc_id, title, summary, url
            for sn in snippets:
                docid = sn.get('id')
                link = sn.find('link')
                title = sn.find('title')
                summary = sn.find('description')

                # Page link and HTML location. A snippet without a <link>
                # element is stored with empty values instead of raising
                # AttributeError, matching how a missing title or
                # description is handled below.
                sn_url = ''
                doc_loc = ''
                if link is not None:
                    if link.text is not None:
                        sn_url = link.text.replace('"', '\\"')
                    cache = link.get('cache')
                    if cache is not None:
                        doc_loc = cache

                # Title
                sn_title = ''
                if title is not None and title.text is not None:
                    sn_title = title.text.replace('"', '\\"').encode('utf-8')

                # Summary: backslashes are dropped before the quotes are
                # escaped, so only the escaping backslashes remain.
                sn_summary = ''
                if summary is not None and summary.text is not None:
                    cleaned = summary.text.replace('\\', '')
                    sn_summary = cleaned.replace('"', '\\"').encode('utf-8')

                # NOTE(review): values are interpolated straight into the SQL
                # text with only double quotes escaped. If db.run_qry can take
                # bound parameters, prefer that -- cannot tell from here.
                # NOTE(review): title and summary are UTF-8 encoded but the
                # URL is not (unchanged from the original) -- confirm that
                # db.run_qry copes with the mix.
                qry = (
                    'insert into fedtask_document '
                    '(doc_id, site_id, title, url, html_location, summary) '
                    'values ("%s", "%s", "%s", "%s", "%s", "%s")'
                ) % (docid, site_id, sn_title, sn_url, doc_loc, sn_summary)
                db.run_qry(qry, conn)
def fill_run_table(rundir):
    """Register every file in *rundir* as a run and store its rank lists.

    Each directory entry gets a fedtask_run row whose run_id is the current
    maximum run_id plus one (1 when the table is empty) and whose description
    is the file name. The file is then read line by line: the first
    space-separated field is the topic id and the third is the document id.
    Consecutive lines sharing a topic id form one rank list, stored in
    fedtask_ranklist as a JSON array of document ids.

    Args:
        rundir: Directory whose entries are all opened as run files.
    """
    def store_ranklist(run_id, topic_id, docs):
        # Insert the JSON-encoded document list for one (run, topic) pair.
        # NOTE(review): values are interpolated straight into the SQL text;
        # a document id containing a single quote would break the statement.
        # Prefer bound parameters if db.run_qry supports them.
        ranklist = simplejson.dumps(docs)
        qry = ("insert into fedtask_ranklist (run_id, topic_id, ranklist) "
               "values(%s, %s, '%s')") % (run_id, topic_id, ranklist)
        db.run_qry(qry, conn)

    for runfile in os.listdir(rundir):
        f = open(os.path.join(rundir, runfile))
        # try/finally so the handle is released even if a query or a
        # malformed line raises part-way through the file.
        try:
            # Insert to run table
            qry = 'select max(run_id) from fedtask_run'
            res = db.run_qry_with_results(qry, conn)
            if res[0][0] is None:
                run_id = 1
            else:
                run_id = res[0][0] + 1
            run_desc = runfile
            qry = ('insert into fedtask_run (run_id, description) '
                   'values (%s, "%s")') % (run_id, run_desc)
            db.run_qry(qry, conn)
            print('Precessing run %s: %s' % (run_id, run_desc))

            current_q = ''
            docs = []
            for line in f:
                line = line.strip()
                if not line:
                    # A blank line (e.g. at end of file) carries no entry;
                    # indexing its fields would raise IndexError.
                    continue
                fields = line.split(' ')
                qid = fields[0]
                docid = fields[2]
                if qid != current_q:
                    # Topic changed: flush the list collected so far.
                    if current_q != '':
                        store_ranklist(run_id, current_q, docs)
                    docs = []
                    current_q = qid
                docs.append(docid)

            # Flush the last topic. Skipped for a file with no entries, which
            # would otherwise produce a statement with an empty topic id.
            if current_q != '':
                store_ranklist(run_id, current_q, docs)
        finally:
            f.close()
def fill_run_table(rundir):
    """Register every file in *rundir* as a run and store its rank lists.

    Each directory entry gets a fedtask_run row whose run_id is the current
    maximum run_id plus one (1 when the table is empty) and whose description
    is the file name. The file is then read line by line: the first
    space-separated field is the topic id and the third is the document id.
    Consecutive lines sharing a topic id form one rank list, stored in
    fedtask_ranklist as a JSON array of document ids.

    Args:
        rundir: Directory whose entries are all opened as run files.
    """
    def store_ranklist(run_id, topic_id, docs):
        # Insert the JSON-encoded document list for one (run, topic) pair.
        # NOTE(review): values are interpolated straight into the SQL text;
        # a document id containing a single quote would break the statement.
        # Prefer bound parameters if db.run_qry supports them.
        ranklist = simplejson.dumps(docs)
        qry = ("insert into fedtask_ranklist (run_id, topic_id, ranklist) "
               "values(%s, %s, '%s')") % (run_id, topic_id, ranklist)
        db.run_qry(qry, conn)

    for runfile in os.listdir(rundir):
        f = open(os.path.join(rundir, runfile))
        # try/finally so the handle is released even if a query or a
        # malformed line raises part-way through the file.
        try:
            # Insert to run table
            qry = 'select max(run_id) from fedtask_run'
            res = db.run_qry_with_results(qry, conn)
            if res[0][0] is None:
                run_id = 1
            else:
                run_id = res[0][0] + 1
            run_desc = runfile
            qry = ('insert into fedtask_run (run_id, description) '
                   'values (%s, "%s")') % (run_id, run_desc)
            db.run_qry(qry, conn)
            print('Precessing run %s: %s' % (run_id, run_desc))

            current_q = ''
            docs = []
            for line in f:
                line = line.strip()
                if not line:
                    # A blank line (e.g. at end of file) carries no entry;
                    # indexing its fields would raise IndexError.
                    continue
                fields = line.split(' ')
                qid = fields[0]
                docid = fields[2]
                if qid != current_q:
                    # Topic changed: flush the list collected so far.
                    if current_q != '':
                        store_ranklist(run_id, current_q, docs)
                    docs = []
                    current_q = qid
                docs.append(docid)

            # Flush the last topic. Skipped for a file with no entries, which
            # would otherwise produce a statement with an empty topic id.
            if current_q != '':
                store_ranklist(run_id, current_q, docs)
        finally:
            f.close()
# Script fragment: stores the RUNS entries and rebuilds the task table.
# The excerpt starts mid-sequence: the first db.run_qry call executes a `qry`
# that was built before the visible text. Indentation was lost in this
# excerpt, so the nesting described here is read from the syntax only.
#
# Visible steps:
#   1. Empty fedtask_run, then insert one (run_id, description) row per key of
#      RUNS, with RUNS[run_id] as the description.
#   2. Empty fedtask_task and read all topic ids from fedtask_topic.
#   3. For every run in RUNS, every entry of UI and every topic, insert a
#      fedtask_task row with a sequential task_id starting at 1, and append
#      that task_id to the "TASKS" list of Experiment_description[expmnt_indx].
#
# NOTE(review): `runs` is fetched from fedtask_run but the loops iterate RUNS
# directly; it looks unused in the visible text -- confirm.
# NOTE(review): expmnt_indx is set to 0 and never changed in the visible text,
# so every task would land in Experiment_description[0]; the increment may lie
# beyond the end of this excerpt -- confirm.
# NOTE(review): the SQL statements are built by string interpolation; prefer
# bound parameters if db.run_qry supports them.
db.run_qry(qry, conn) print 'Storing RUNS' qry = 'delete from fedtask_run' db.run_qry(qry, conn) for run_id in RUNS: qry = 'insert into fedtask_run (run_id, description)\ values(%s,"%s")' % (run_id, RUNS[run_id]) db.run_qry(qry, conn) # Make tasks print "Fill task table" qry = 'delete from fedtask_task' db.run_qry(qry, conn) qry = 'select run_id from fedtask_run' runs = db.run_qry_with_results(qry, conn) qry = 'select topic_id from fedtask_topic' topics = db.run_qry_with_results(qry, conn) # A task consists of: topic, run, ui task_id = 1 expmnt_indx = 0 for r in RUNS: for u in UI: # add the tasks to an experiment here expmnt = Experiment_description[expmnt_indx] for t in topics: expmnt["TASKS"].append(task_id) qry = 'insert into fedtask_task (task_id, run_id, topic_id, ui_id) values(%s, %s, %s, %s)' % ( task_id, r, t[0], u) db.run_qry(qry, conn) task_id += 1
# Script fragment: stores the RUNS entries and rebuilds the task table.
# Indentation was lost in this excerpt, so the nesting described here is read
# from the syntax only.
#
# Visible steps:
#   1. Empty fedtask_run, then insert one (run_id, description) row per key of
#      RUNS, with RUNS[run_id] as the description.
#   2. Empty fedtask_task and read all topic ids from fedtask_topic.
#   3. For every run in RUNS, every entry of UI and every topic, insert a
#      fedtask_task row with a sequential task_id starting at 1, and append
#      that task_id to the "TASKS" list of Experiment_description[expmnt_indx].
#
# NOTE(review): `runs` is fetched from fedtask_run but the loops iterate RUNS
# directly; it looks unused in the visible text -- confirm.
# NOTE(review): the loop level of the trailing `expmnt_indx+=1` cannot be
# determined from this flattened text; presumably it runs once per (run, UI)
# pair, after the topic loop, so each pair fills its own experiment -- confirm
# against the original file.
# NOTE(review): the SQL statements are built by string interpolation; prefer
# bound parameters if db.run_qry supports them.
print 'Storing RUNS' qry = 'delete from fedtask_run' db.run_qry(qry, conn) for run_id in RUNS: qry = 'insert into fedtask_run (run_id, description)\ values(%s,"%s")'%(run_id, RUNS[run_id]) db.run_qry(qry, conn) # Make tasks print "Fill task table" qry = 'delete from fedtask_task' db.run_qry(qry, conn) qry = 'select run_id from fedtask_run' runs = db.run_qry_with_results(qry, conn) qry = 'select topic_id from fedtask_topic' topics = db.run_qry_with_results(qry, conn) # A task consists of: topic, run, ui task_id = 1 expmnt_indx = 0 for r in RUNS: for u in UI: # add the tasks to an experiment here expmnt = Experiment_description[expmnt_indx] for t in topics: expmnt["TASKS"].append(task_id) qry = 'insert into fedtask_task (task_id, run_id, topic_id, ui_id) values(%s, %s, %s, %s)'%(task_id, r, t[0], u) db.run_qry(qry, conn) task_id += 1 expmnt_indx+=1