Example #1
0
def fill_doc_table(snippets_loc, docs_loc):
    """Insert one fedtask_document row per snippet of every (site, topic) pair.

    Site ids are read from fedtask_site and topic ids from fedtask_topic. For
    each pair the file ``<snippets_loc>/<site_id>/<topic_id>.xml`` is parsed
    and every snippet element found in it is inserted with its id, title,
    url, cached-page location and summary.

    Args:
        snippets_loc: Directory that contains one sub-directory per site id,
            each holding one ``<topic_id>.xml`` file per topic.
        docs_loc: Unused here; kept so existing callers do not break.

    Note:
        Errors raised while reading or parsing a snippet file are not caught
        here, so the first bad file aborts the whole load.
    """
    def _sql_text(text):
        # Text destined for a double-quoted SQL literal: None becomes '',
        # embedded double quotes are backslash-escaped, and the result is
        # UTF-8 encoded so that every interpolated text field has the same
        # string type (mixing unicode and non-ASCII byte strings in one
        # '%' format raises UnicodeDecodeError on Python 2).
        if text is None:
            return ''
        return text.replace('"', '\\"').encode('utf-8')

    print('Process snippets and documents')
    # Get all sites
    qry = 'select distinct site_id from fedtask_site'
    sites = db.run_qry_with_results(qry, conn)
    # Get all topics
    qry = 'select distinct topic_id from fedtask_topic'
    topics = db.run_qry_with_results(qry, conn)

    # Get snippets of (site, topic)
    for s in sites:
        site_id = s[0]
        for t in topics:
            snippet_file = '%s/%s/%s.xml' % (snippets_loc, site_id, t[0])
            root = et.parse(snippet_file).getroot()
            # The snippets are the children of the third child of
            # <search_results>.
            snippets = root.find('search_results')[2]
            for sn in snippets:
                docid = sn.get('id')
                link = sn.find('link')
                title = sn.find('title')
                summary = sn.find('description')

                # Page link and cached HTML location both live on <link>.
                # A snippet without <link> gets empty values instead of
                # crashing, mirroring how a missing title/description is
                # handled. Compare against None explicitly: an element
                # without children is falsy.
                sn_url = ''
                doc_loc = ''
                if link is not None:
                    sn_url = _sql_text(link.text)
                    doc_loc = link.get('cache')
                    if doc_loc is None:
                        doc_loc = ''

                sn_title = ''
                if title is not None:
                    sn_title = _sql_text(title.text)

                sn_summary = ''
                if summary is not None and summary.text is not None:
                    # Backslashes are dropped from summaries before the
                    # quotes are escaped.
                    sn_summary = _sql_text(summary.text.replace('\\', ''))

                qry = 'insert into fedtask_document (doc_id, site_id, title, url, html_location, summary) values ("%s", "%s", "%s", "%s", "%s", "%s")' % (
                    docid, site_id, sn_title, sn_url, doc_loc, sn_summary)
                db.run_qry(qry, conn)
Example #2
0
def fill_doc_table(snippets_loc, docs_loc):
	"""Insert one fedtask_document row per snippet of every (site, topic) pair.

	Site ids are read from fedtask_site and topic ids from fedtask_topic. For
	each pair the file ``<snippets_loc>/<site_id>/<topic_id>.xml`` is parsed
	and every snippet element found in it is inserted with its id, title,
	url, cached-page location and summary.

	Args:
		snippets_loc: Directory that contains one sub-directory per site id,
			each holding one ``<topic_id>.xml`` file per topic.
		docs_loc: Unused here; kept so existing callers do not break.

	Note:
		Errors raised while reading or parsing a snippet file are not caught
		here, so the first bad file aborts the whole load.
	"""
	def _sql_text(text):
		# Text destined for a double-quoted SQL literal: None becomes '',
		# embedded double quotes are backslash-escaped, and the result is
		# UTF-8 encoded so that every interpolated text field has the same
		# string type (mixing unicode and non-ASCII byte strings in one
		# '%' format raises UnicodeDecodeError on Python 2).
		if text is None:
			return ''
		return text.replace('"', '\\"').encode('utf-8')

	print('Process snippets and documents')
	# Get all sites
	qry = 'select distinct site_id from fedtask_site'
	sites = db.run_qry_with_results(qry, conn)
	# Get all topics
	qry = 'select distinct topic_id from fedtask_topic'
	topics = db.run_qry_with_results(qry, conn)

	# Get snippets of (site, topic)
	for s in sites:
		site_id = s[0]
		for t in topics:
			snippet_file = '%s/%s/%s.xml'%(snippets_loc, site_id, t[0])
			root = et.parse(snippet_file).getroot()
			# The snippets are the children of the third child of
			# <search_results>.
			snippets = root.find('search_results')[2]
			for sn in snippets:
				docid = sn.get('id')
				link = sn.find('link')
				title = sn.find('title')
				summary = sn.find('description')

				# Page link and cached HTML location both live on <link>.
				# A snippet without <link> gets empty values instead of
				# crashing, mirroring how a missing title/description is
				# handled. Compare against None explicitly: an element
				# without children is falsy.
				sn_url = ''
				doc_loc = ''
				if link is not None:
					sn_url = _sql_text(link.text)
					doc_loc = link.get('cache')
					if doc_loc is None:
						doc_loc = ''

				sn_title = ''
				if title is not None:
					sn_title = _sql_text(title.text)

				sn_summary = ''
				if summary is not None and summary.text is not None:
					# Backslashes are dropped from summaries before the
					# quotes are escaped.
					sn_summary = _sql_text(summary.text.replace('\\', ''))

				qry = 'insert into fedtask_document (doc_id, site_id, title, url, html_location, summary) values ("%s", "%s", "%s", "%s", "%s", "%s")'%(docid, site_id, sn_title, sn_url, doc_loc, sn_summary)
				db.run_qry(qry, conn)
Example #3
0
def fill_run_table(rundir):
	"""Store every run file found in rundir together with its rankings.

	For each file a fedtask_run row is created (run_id = current maximum + 1,
	or 1 when the table is empty; description = the file name). The file is
	then read line by line: the first whitespace-separated field is the topic
	id and the third one the document id. Consecutive lines with the same
	topic id form one ranking, which is stored in fedtask_ranklist as a JSON
	list of document ids.

	Args:
		rundir: Directory whose entries are all treated as run files.

	Raises:
		IndexError: If a non-blank line has fewer than three fields.
	"""
	def _store_ranklist(run_id, topic_id, doc_ids):
		# Persist the ranking of one topic as a JSON-encoded list.
		ranklist = simplejson.dumps(doc_ids)
		qry = "insert into fedtask_ranklist (run_id, topic_id, ranklist) values(%s, %s, '%s')"%(run_id, topic_id, ranklist)
		db.run_qry(qry, conn)

	for runfile in os.listdir(rundir):
		f = open('%s/%s'%(rundir, runfile))
		# try/finally so the handle is released even when a query or a
		# malformed line raises part-way through the file.
		try:
			# Insert to run table
			qry = 'select max(run_id) from fedtask_run'
			res = db.run_qry_with_results(qry, conn)
			if res[0][0] is None:
				run_id = 1
			else:
				run_id = res[0][0] + 1

			run_desc = runfile
			qry = 'insert into fedtask_run (run_id, description) values (%s, "%s")'%(run_id, run_desc)
			db.run_qry(qry, conn)

			print('Precessing run %s: %s'%(run_id, run_desc))
			current_q = None
			docs = []
			for line in f:
				# split() without an argument copes with tabs and repeated
				# spaces, which split(' ') would turn into wrong fields.
				fields = line.split()
				if not fields:
					# Blank line (e.g. trailing newline at end of file).
					continue
				qid = fields[0]
				docid = fields[2]
				if qid != current_q:
					# Topic changed: flush the ranking collected so far.
					if docs:
						_store_ranklist(run_id, current_q, docs)
						docs = []
					current_q = qid
				docs.append(docid)
			# Flush the last topic; an empty file stores no ranking at all
			# instead of issuing an insert with a missing topic id.
			if docs:
				_store_ranklist(run_id, current_q, docs)
		finally:
			f.close()
Example #4
0
def fill_run_table(rundir):
    """Store every run file found in rundir together with its rankings.

    For each file a fedtask_run row is created (run_id = current maximum + 1,
    or 1 when the table is empty; description = the file name). The file is
    then read line by line: the first whitespace-separated field is the topic
    id and the third one the document id. Consecutive lines with the same
    topic id form one ranking, which is stored in fedtask_ranklist as a JSON
    list of document ids.

    Args:
        rundir: Directory whose entries are all treated as run files.

    Raises:
        IndexError: If a non-blank line has fewer than three fields.
    """
    def _store_ranklist(run_id, topic_id, doc_ids):
        # Persist the ranking of one topic as a JSON-encoded list.
        ranklist = simplejson.dumps(doc_ids)
        qry = "insert into fedtask_ranklist (run_id, topic_id, ranklist) values(%s, %s, '%s')" % (
            run_id, topic_id, ranklist)
        db.run_qry(qry, conn)

    for runfile in os.listdir(rundir):
        f = open('%s/%s' % (rundir, runfile))
        # try/finally so the handle is released even when a query or a
        # malformed line raises part-way through the file.
        try:
            # Insert to run table
            qry = 'select max(run_id) from fedtask_run'
            res = db.run_qry_with_results(qry, conn)
            if res[0][0] is None:
                run_id = 1
            else:
                run_id = res[0][0] + 1

            run_desc = runfile
            qry = 'insert into fedtask_run (run_id, description) values (%s, "%s")' % (
                run_id, run_desc)
            db.run_qry(qry, conn)

            print('Precessing run %s: %s' % (run_id, run_desc))
            current_q = None
            docs = []
            for line in f:
                # split() without an argument copes with tabs and repeated
                # spaces, which split(' ') would turn into wrong fields.
                fields = line.split()
                if not fields:
                    # Blank line (e.g. trailing newline at end of file).
                    continue
                qid = fields[0]
                docid = fields[2]
                if qid != current_q:
                    # Topic changed: flush the ranking collected so far.
                    if docs:
                        _store_ranklist(run_id, current_q, docs)
                        docs = []
                    current_q = qid
                docs.append(docid)
            # Flush the last topic; an empty file stores no ranking at all
            # instead of issuing an insert with a missing topic id.
            if docs:
                _store_ranklist(run_id, current_q, docs)
        finally:
            f.close()
Example #5
0
    db.run_qry(qry, conn)

# Rebuild fedtask_run from RUNS: wipe the table, then insert one row per
# run id with RUNS[run_id] as its description.
print('Storing RUNS')
qry = 'delete from fedtask_run'
db.run_qry(qry, conn)
for run_id in RUNS:
    # The three tabs between the column list and "values" are part of the
    # statement text; they are spelled as escapes so they stay visible.
    qry = ('insert into fedtask_run (run_id, description)\t\t\tvalues(%s,"%s")'
           % (run_id, RUNS[run_id]))
    db.run_qry(qry, conn)

# Rebuild fedtask_task: clear it, then read back the run ids and topic ids.
print("Fill task table")
qry = 'delete from fedtask_task'
db.run_qry(qry, conn)
qry = 'select run_id from fedtask_run'
runs = db.run_qry_with_results(qry, conn)
qry = 'select topic_id from fedtask_topic'
topics = db.run_qry_with_results(qry, conn)

# One task per (run, ui, topic) combination, numbered consecutively from 1.
# Every task id is also appended to the "TASKS" list of the experiment
# selected by expmnt_indx.
# NOTE(review): expmnt_indx is never advanced in the code visible here, so
# every task lands in Experiment_description[0] - confirm this is intended.
task_id = 1
expmnt_indx = 0
for r in RUNS:
    for u in UI:
        expmnt = Experiment_description[expmnt_indx]
        for t in topics:
            expmnt["TASKS"].append(task_id)
            qry = (
                'insert into fedtask_task (task_id, run_id, topic_id, ui_id) values(%s, %s, %s, %s)'
                % (task_id, r, t[0], u)
            )
            db.run_qry(qry, conn)
            task_id += 1
Example #6
0
# Rebuild fedtask_run from RUNS: wipe the table, then insert one row per
# run id with RUNS[run_id] as its description.
print('Storing RUNS')
qry = 'delete from fedtask_run'
db.run_qry(qry, conn)
for run_id in RUNS:
	# The three tabs between the column list and "values" are part of the
	# statement text; they are spelled as escapes so they stay visible.
	qry = ('insert into fedtask_run (run_id, description)\t\t\tvalues(%s,"%s")'
		% (run_id, RUNS[run_id]))
	db.run_qry(qry, conn)


# Rebuild fedtask_task: clear it, then read back the run ids and topic ids.
print("Fill task table")
qry = 'delete from fedtask_task'
db.run_qry(qry, conn)
qry = 'select run_id from fedtask_run'
runs = db.run_qry_with_results(qry, conn)
qry = 'select topic_id from fedtask_topic'
topics = db.run_qry_with_results(qry, conn)

# One task per (run, ui, topic) combination, numbered consecutively from 1.
# Each (run, ui) pair uses the next entry of Experiment_description, and
# every task id created for the pair is appended to that entry's "TASKS".
task_id = 1
expmnt_indx = 0
for r in RUNS:
	for u in UI:
		expmnt = Experiment_description[expmnt_indx]
		for t in topics:
			expmnt["TASKS"].append(task_id)
			qry = (
				'insert into fedtask_task (task_id, run_id, topic_id, ui_id) values(%s, %s, %s, %s)'
				% (task_id, r, t[0], u)
			)
			db.run_qry(qry, conn)
			task_id += 1
		# Move on to the experiment of the next (run, ui) pair.
		expmnt_indx += 1