def recoverSubtopic():
    """Re-open a soft-deleted subtopic and restore its filter_list entries."""
    subtopic_id = int(sys.argv[1])
    atn_db = DBHandler('./database/test.db')
    atn_db.cur.execute('UPDATE subtopic SET state=0 WHERE subtopic_id=?',
                       [subtopic_id])
    # Re-activate filter_list rows for documents that still have live
    # passages under this subtopic.
    atn_db.cur.execute('''
        UPDATE filter_list SET state=1
        WHERE topic_id = (SELECT topic_id FROM subtopic WHERE subtopic_id=?)
          AND docno IN (SELECT DISTINCT passage.docno FROM passage
                        WHERE passage.subtopic_id=? AND passage.state=0)
          AND state!=1
        ''', [subtopic_id, subtopic_id])
    # Add filter_list rows for documents of this subtopic that are not
    # listed for the topic yet.
    atn_db.cur.execute('''
        INSERT INTO filter_list (topic_id, docno, state)
        SELECT DISTINCT subtopic.topic_id, passage.docno, 1
        FROM subtopic, passage
        WHERE subtopic.subtopic_id = passage.subtopic_id
          AND subtopic.subtopic_id=?
          AND passage.state = 0
          AND passage.docno NOT IN (SELECT docno FROM filter_list
                                    WHERE topic_id = subtopic.topic_id)
        ''', [subtopic_id])
    atn_db.commit()
    atn_db.close()
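# The sqlite-based maintenance scripts in this collection go through a
# DBHandler class that is not shown here (the Flask snippets further down use
# a different, Mongo/ES-backed handler of the same name). A minimal sketch is
# given below for reference; the constructor signature and the positional
# `insert` helper are assumptions inferred from usage, not the project's
# actual implementation.
import sqlite3

class DBHandler:
    def __init__(self, db_path):
        self.conn = sqlite3.connect(db_path)
        self.cur = self.conn.cursor()

    def insert(self, table, values):
        # One '?' placeholder per value, relying on the table's column order.
        placeholders = ','.join('?' * len(values))
        self.cur.execute('INSERT INTO %s VALUES (%s)' % (table, placeholders),
                         values)

    def commit(self):
        self.conn.commit()

    def close(self):
        self.conn.close()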
def getDocList1():
    topic_id, subtopic_id = int(sys.argv[1]), int(sys.argv[2])
    atn_db = DBHandler('../../../database/test.db')
    atn_db.cur.execute(
        'SELECT userid, domain_id, topic_name FROM topic WHERE topic_id=?',
        [topic_id])
    userid, domain_id, topic_name = atn_db.cur.fetchone()
    atn_db.cur.execute('SELECT username FROM user WHERE userid=?', [userid])
    username, = atn_db.cur.fetchone()
    atn_db.cur.execute('SELECT subtopic_name FROM subtopic WHERE subtopic_id=?',
                       [subtopic_id])
    subtopic_name, = atn_db.cur.fetchone()
    corpus = ['EBOLA', 'POLAR', 'WEAPON'][domain_id - 1]
    r = requests.get(
        nistURL + "CMD=UID=%d TID=%d STID=%d.%d CO=%s CMD=MORE_LIKE_THIS DATA=-"
        % (userid, topic_id, topic_id, subtopic_id, corpus),
        verify=False)
    #mylog.log_nist_findmore(username, sys.argv[1], topic_name, sys.argv[2],
    #                        subtopic_name+"::"+r.url+"::")
    docs = r.text.split('\n')
    for doc in docs:
        if doc:
            print(doc.split()[0])
def userAuthentication(username, password):
    user_db = DBHandler(db_path.user)
    user_db.cur.execute(
        'SELECT userid, username, usercookie FROM user WHERE username = ? AND password = ?',
        [username, password])
    result = user_db.cur.fetchone()
    user_db.close()
    return result
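# Illustrative only: how userAuthentication might be wired into a login
# handler. The handler name and form-field keys are assumptions, not part of
# the original code.
def handle_login(form):
    row = userAuthentication(form['username'], form['password'])
    if row is None:
        return None  # unknown user or wrong password
    userid, username, usercookie = row
    return usercookie  # e.g. set this as the session cookie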
def dupsummary():
    atn_db = DBHandler("./database/test.db")
    fh = open('./view/nonrelevant.csv', 'w')
    atn_db.cur.execute('''
        SELECT filter_list.topic_id, filter_list.docno
        FROM filter_list, topic
        WHERE filter_list.topic_id=topic.topic_id
          AND topic.state!=2
          AND topic.userid<=6
          AND filter_list.state=2
        ORDER BY filter_list.topic_id
        ''')
    dups = atn_db.cur.fetchall()
    for dup in dups:
        fh.write(str(dup[0]) + ',' + dup[1] + '\n')
    fh.close()
    atn_db.close()
def cookieAuthentication(env):
    user_db = DBHandler(db_path.user)
    result = None
    if 'HTTP_COOKIE' in env:
        for pair in env['HTTP_COOKIE'].split(';'):
            cookie = pair.strip()
            if cookie.startswith('usercookie'):
                # split on the first '=' only, in case the value contains '='
                key, value = cookie.split('=', 1)
                user_db.cur.execute(
                    'SELECT userid, username, usercookie FROM user WHERE usercookie = ?',
                    [value])
                result = user_db.cur.fetchone()
                break
    user_db.close()
    return result
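# Illustrative only: exercising cookieAuthentication against a hand-built
# WSGI environ. The cookie value below is a made-up placeholder.
env = {'HTTP_COOKIE': 'theme=dark; usercookie=abc123'}
row = cookieAuthentication(env)  # matches the 'usercookie' pair; None if no user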
def dupTopic():
    userid = 30
    topic_id = 391  # copy this topic to this userid
    atn_db = DBHandler('./database/test.db')
    atn_db.insert('topic', [None, "slums and orphans _ debug", None, userid,
                            1, 'L', 'L', '', '', 0])
    new_tid = atn_db.cur.lastrowid
    atn_db.cur.execute('SELECT * FROM subtopic WHERE topic_id=? AND state=0',
                       [topic_id])
    subtopics = atn_db.cur.fetchall()
    for subtopic in subtopics:
        atn_db.insert('subtopic', [None, subtopic[1] + ' _ debug', new_tid, 0, 0])
        new_sid = atn_db.cur.lastrowid
        atn_db.cur.execute('SELECT * FROM passage WHERE subtopic_id=? AND state=0',
                           [subtopic[0]])
        passages = atn_db.cur.fetchall()
        for passage in passages:
            atn_db.insert('passage', [None, passage[1], passage[2], 0, 0,
                                      passage[5], new_sid, 0])
    atn_db.cur.execute('SELECT docno, state FROM filter_list WHERE topic_id=?',
                       [topic_id])
    fdocs = atn_db.cur.fetchall()
    for fdoc in fdocs:
        docno, state = fdoc
        atn_db.insert('filter_list', [new_tid, docno, state])
    atn_db.commit()
    atn_db.close()
import os

from flask import Flask
from flask_cors import CORS

from util import load_config
from database import DBHandler
from constants import TOPICS, COUNTRIES

here = os.path.dirname(os.path.abspath(__file__))
cfg = load_config()

app = Flask(__name__)
CORS(app, origins=cfg['access_control_allow_origin'])
mongo = DBHandler(
    host=cfg['database']['host'],
    port=cfg['database']['port'],
    db_name=cfg['database']['db_name'],
    collection_name=cfg['database']['collection_name'],
    es_host=cfg['es']['host'],
    es_port=cfg['es']['port'],
)


class InvalidUsage(Exception):
    status_code = 400

    def __init__(self, message, status_code=None, payload=None):
        Exception.__init__(self)
        self.message = message
        if status_code is not None:
            self.status_code = status_code
        self.payload = payload
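# InvalidUsage is usually paired with a serializer and a Flask error handler
# so views can simply `raise InvalidUsage(...)`. The sketch below follows the
# pattern from the Flask documentation; it is an assumption, not code taken
# from this project.
from flask import jsonify

@app.errorhandler(InvalidUsage)
def handle_invalid_usage(error):
    body = dict(error.payload or ())
    body['message'] = error.message
    response = jsonify(body)
    response.status_code = error.status_code
    return response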
def __init__(self, db_conn, url_file):
    """Initialize the crawler with a connection to the database to populate
    and with the file containing the list of seed URLs to begin indexing."""
    self._url_queue = []
    self._doc_id_cache = {}
    self._word_id_cache = {}
    self._url_list = {}
    self._word_list = {}
    self._inverted_index = {}
    self._resolved_inverted_index = {}
    self._link_list = []
    self._db = DBHandler()

    # functions to call when entering and exiting specific tags
    self._enter = defaultdict(lambda *a, **ka: self._visit_ignore)
    self._exit = defaultdict(lambda *a, **ka: self._visit_ignore)

    # add a link to our graph, and indexing info to the related page
    self._enter['a'] = self._visit_a

    # record the currently indexed document's title and increase
    # the font size
    def visit_title(*args, **kargs):
        self._visit_title(*args, **kargs)
        self._increase_font_factor(7)(*args, **kargs)

    # increase the font size when we enter these tags
    self._enter['b'] = self._increase_font_factor(2)
    self._enter['strong'] = self._increase_font_factor(2)
    self._enter['i'] = self._increase_font_factor(1)
    self._enter['em'] = self._increase_font_factor(1)
    self._enter['h1'] = self._increase_font_factor(7)
    self._enter['h2'] = self._increase_font_factor(6)
    self._enter['h3'] = self._increase_font_factor(5)
    self._enter['h4'] = self._increase_font_factor(4)
    self._enter['h5'] = self._increase_font_factor(3)
    self._enter['title'] = visit_title

    # decrease the font size when we exit these tags
    self._exit['b'] = self._increase_font_factor(-2)
    self._exit['strong'] = self._increase_font_factor(-2)
    self._exit['i'] = self._increase_font_factor(-1)
    self._exit['em'] = self._increase_font_factor(-1)
    self._exit['h1'] = self._increase_font_factor(-7)
    self._exit['h2'] = self._increase_font_factor(-6)
    self._exit['h3'] = self._increase_font_factor(-5)
    self._exit['h4'] = self._increase_font_factor(-4)
    self._exit['h5'] = self._increase_font_factor(-3)
    self._exit['title'] = self._increase_font_factor(-7)

    # never go in and parse these tags
    self._ignored_tags = set([
        'meta', 'script', 'link', 'embed', 'iframe', 'frame',
        'noscript', 'object', 'svg', 'canvas', 'applet', 'frameset',
        'textarea', 'style', 'area', 'map', 'base', 'basefont', 'param',
    ])

    # set of words to ignore
    self._ignored_words = set([
        '', 'the', 'of', 'at', 'on', 'in', 'is', 'it',
        'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
        'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
        'and', 'or',
    ])

    # TODO remove me in real version
    self._mock_next_doc_id = 1
    self._mock_next_word_id = 1

    # keep track of some info about the page we are currently parsing
    self._curr_depth = 0
    self._curr_url = ""
    self._curr_doc_id = 0
    self._font_size = 0
    self._curr_words = None

    # get all urls into the queue
    try:
        with open(url_file, 'r') as f:
            for line in f:
                self._url_queue.append((self._fix_url(line.strip(), ""), 0))
    except IOError:
        pass
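# _increase_font_factor is referenced above but not defined in this excerpt.
# A minimal sketch, assuming it is a factory returning a tag-visitor closure
# that adjusts the running font-size weight (inferred from how it is
# registered in self._enter and self._exit):
def _increase_font_factor(self, factor):
    """Return a visitor that bumps the current font size by `factor`."""
    def increase_it(elem):
        self._font_size += factor
    return increase_it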
import json

from flask import Flask
# the old flask.ext.* import paths are deprecated; use the flask_* packages
from flask_restful import Api, Resource, reqparse
from flask_restful.utils import cors
from flask_cors import CORS

from modules.ssh import QoSHandler

app = Flask(__name__)
CORS(app)
api = Api(app)

config = json.load(open('./config.json', 'r'))
db_handler = DBHandler(config)


class UserAPI(Resource):
    def __init__(self):
        self.reqparse = reqparse.RequestParser()
        super(UserAPI, self).__init__()

    @cors.crossdomain(origin='*')
    def get(self, uid):
        users = []
        if uid != 'all':
            users = uid.split(',')
        resultset = db_handler.get_users(users)
def do_search(keywords):
    global user_top_20_database

    # Fetch the current session
    request_session = request.environ['beaker.session']
    # Fetch the user's email for their session
    user_email = request_session.get('user_email', 'Anonymous')

    # If the query is made up entirely of math characters, evaluate it as an
    # arithmetic expression (eval is gated by the math_chars whitelist).
    if all(c in math_chars for c in keywords):
        try:
            result = eval(
                keywords.replace('^', '**').replace('[', '(').replace(']', ')'))
            return result_template(
                user_email, keywords,
                template('''
                    <p> {{keywords}} = {{result}} </p>
                    ''', keywords=keywords, result=result))
        except Exception:
            pass

    # A list of all keywords from the search query.
    keyword_list = [word.lower() for word in keywords.split()]

    # Count each distinct keyword, then sort in descending order of frequency.
    counted_keyword_list = [(keyword_list.count(x), x) for x in set(keyword_list)]
    counted_keyword_list.sort(key=wordCount, reverse=True)

    page = request.query.get('page')
    if user_email != 'Anonymous' and page is None:
        # Fetch the top-20 list for that user's email
        user_top_20 = user_top_20_database.get(user_email)
        if user_top_20 is not None:
            # Merge the new counts into the stored list: sum the counts for
            # keywords already present, append the rest.
            merged = {word: count for count, word in user_top_20}
            for count, word in counted_keyword_list:
                merged[word] = merged.get(word, 0) + count
            user_top_20 = [(count, word) for word, count in merged.items()]
            # Order by descending frequency of each keyword.
            user_top_20.sort(key=wordCount, reverse=True)
            # If the list is longer than 20 keywords, trim it.
            del user_top_20[20:]
            # Update the database of user search history, keyed by the
            # user's email.
            user_top_20_database[user_email] = user_top_20

    # An empty query returns an empty results page.
    if not keyword_list:
        return generate_page_results(1, [], [], user_email)

    page = 1 if page is None else int(page)

    db = DBHandler()

    # Get the word_ids through a getter in the database, skipping stop words.
    ignored_words = set([
        '', 'the', 'of', 'at', 'on', 'in', 'is', 'it',
        'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
        'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
        'and', 'or',
    ])
    word_ids = []
    for keyword in keyword_list:
        if keyword in ignored_words:
            continue
        word_ids.append(db.get_word_id(keyword))

    # Get the doc_ids from the word_ids in the database.
    list_of_doc_id_lists = []
    for word_id in word_ids:
        if word_id is None:
            list_of_doc_id_lists.append([])
        else:
            list_of_doc_id_lists.append(db.get_doc_ids(word_id))

    # Intersect the per-word doc_id lists: only documents that contain every
    # keyword survive.
    intersecting_doc_ids = find_intersections(list_of_doc_id_lists)

    # Get the url ranks from PageRank in the database and zip the doc_ids
    # with their corresponding ranks.
    ranks = db.get_pageranks(intersecting_doc_ids)
    ranked_doc_ids = list(zip(ranks, intersecting_doc_ids))

    # Sort the (rank, doc_id) pairs and fetch the URLs in that order.
    ranked_sorted_doc_ids = sorted(ranked_doc_ids, key=itemgetter(0))
    doc_ids_in_rank_order = [doc_id for _, doc_id in ranked_sorted_doc_ids]
    results_list = [row[0] for row in db.get_urls(doc_ids_in_rank_order)]

    return generate_page_results(page, results_list, keyword_list, user_email)
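# do_search depends on two helpers not shown in this excerpt. Minimal
# sketches, assuming wordCount is the sort key over (count, keyword) pairs
# and find_intersections keeps only doc_ids common to every per-word list:
def wordCount(pair):
    # pair is (count, keyword); order by the count.
    return pair[0]

def find_intersections(list_of_doc_id_lists):
    # Intersect all doc_id lists; no lists means no results.
    if not list_of_doc_id_lists:
        return []
    common = set(list_of_doc_id_lists[0])
    for doc_ids in list_of_doc_id_lists[1:]:
        common &= set(doc_ids)
    return list(common)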
def setUp(self):
    self.db = DBHandler()