def reducer(self, key, value):
    # materialize the generator so the similarity score can be indexed
    value = list(value)
    hbase = Hbase(DB_NAME)
    sim_pages = hbase.table("mr_pages_similarity_umls")
    # insert keys and similarity values in hbase, once under each page id
    sim_pages.put(str(key[0]), {'paired_page:' + str(key[1]): str(value[0])})
    sim_pages.put(str(key[1]), {'paired_page:' + str(key[0]): str(value[0])})
    yield key, value
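
# A minimal read-back sketch, assuming the same Hbase wrapper and a page id
# that exists in the table. Because the reducer writes each score under both
# page ids, a single row() call returns every pair a page participates in.
def example_read_similarities(page_id):
    hbase = Hbase(DB_NAME)
    sim_pages = hbase.table("mr_pages_similarity_umls")
    row = sim_pages.row(str(page_id))
    # strip the 'paired_page:' prefix (12 characters) from each qualifier
    return [(col[12:], val) for col, val in row.items()]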
class Pages:
    def __init__(self):
        # initiate the HBase table used by the Spark job
        self.hbase = Hbase(DB_NAME)
        self.create_similarity_hbase_table()
        self.table = self.hbase.table("spark_pages_similarity_umls_subset")

    def create_similarity_hbase_table(self):
        # create the similarity table, ignoring the error if it already exists
        self.hbase_connection = self.hbase.get_connection()
        try:
            self.hbase_connection.create_table("spark_pages_similarity_umls_subset", {'paired_page': dict()})
        except happybase.hbase.ttypes.AlreadyExists:
            pass

    def insert(self, pid1, pid2, sim):
        # store the similarity of pid2 to pid1 under pid1's row
        self.table.put(str(pid1), {'paired_page:' + str(pid2): str(sim)})
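
# A hedged usage sketch; the page ids and score are made-up examples. Unlike
# the MapReduce reducer, insert() writes only the pid1 -> pid2 direction, so
# the caller invokes it twice to keep the similarity table symmetric.
def example_insert_pair():
    pages = Pages()
    pages.insert(752675, 752680, 0.42)
    pages.insert(752680, 752675, 0.42)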
def main():
    # test to make sure values made it into hbase
    hbase = Hbase(DB_NAME)
    sim_pages = hbase.table("mr_pages_similarity_umls")
    print(sim_pages.cells('752675', 'paired_page', versions=20000, include_timestamp=True))
# Standard-library and third-party imports used by this job; Hbase and
# DB_NAME are assumed to come from the project's own database helper module.
import json
import time

import happybase
from mrjob.job import MRJob

# Import stop words and table of page summaries as global variables
file_list = []
with open('stop_words.txt', 'r') as stop_file:
    for r in stop_file:
        file_list.extend(r.split(','))
stop_words = set(file_list)

contentHash = {}
with open('pages.json') as data_file:
    for line in data_file:
        d = dict(json.loads(line))
        contentHash[d['page_id']] = d['summary']

# Create an HBase table for similarity values
hbase = Hbase(DB_NAME)
hbase_connection = hbase.get_connection()
try:
    hbase_connection.create_table("mr_pages_similarity_umls", {'paired_page': dict()})
except happybase.hbase.ttypes.AlreadyExists:
    pass

# Start the timer
start = time.time()
startMap = 0
finish = 0


class MRPageAssociate(MRJob):
    def mapper(self, _, line):
        j = json.loads(line)
        d = dict(j)
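        # The mapper body is truncated here. A plausible completion, inferred
        # from the reducer's ((id1, id2), similarity) contract and the Jaccard
        # logic in Pages.pagesJaccSimilarity, is sketched below as comments;
        # treat it as an assumption, not the original code:
        #
        #   words = set(re.findall(r"[\w']+", d['summary'].lower())) - stop_words
        #   for other_id, other_summary in contentHash.items():
        #       if other_id <= d['page_id']:
        #           continue  # emit each unordered pair once
        #       other = set(re.findall(r"[\w']+", other_summary.lower())) - stop_words
        #       if words | other:
        #           yield (d['page_id'], other_id), 1.0 * len(words & other) / len(words | other)

# mrjob's standard entry point, assuming the class lives in this module:
# if __name__ == '__main__':
#     MRPageAssociate.run()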
class System:
    def __init__(self):
        '''Default constructor: hot storage is Postgres, cold storage is HBase.'''
        self.hot = DB(DB_NAME)
        self.cold = Hbase(DB_NAME)

    def initialize(self, name, cols):
        '''
        Initialize a table in both storage layers.
        @param name: name of table
        @param cols: dict mapping column name to its SQL type
        '''
        try:
            self.cold.get_connection().create_table(name, {'default': dict()})
        except happybase.hbase.ttypes.AlreadyExists:
            # recreate the cold table from scratch
            self.cold.get_connection().disable_table(name)
            self.cold.get_connection().delete_table(name)
            self.cold.get_connection().create_table(name, {'default': dict()})
        self.hot.cursor().execute("drop table if exists " + name + ";")
        create_table = "create table " + name + "("
        for key in sorted(cols):
            create_table += key + ' ' + cols[key] + ","
        create_table = create_table[:-1] + ");"  # drop the trailing comma
        self.hot.cursor().execute(create_table)
        self.hot.connection().commit()

    def load(self, table, data_list):
        '''
        Load data directly into cold storage.
        @param table: name of table to be loaded
        @param data_list: list of row dictionaries, each of which must contain a 'key' entry
        @exception: raised if a row fails to define a key
        '''
        for data in data_list:
            if 'key' not in data:
                raise Exception('Error: Must define a key')
            self.cold.insert(table, data['key'], data)

    def clear(self):
        '''Clear the cache, printing its contents first.'''
        self.replacement_algorithm.printContents()
        self.replacement_algorithm.clear()

    def setReplacementAlgorithm(self, name, size):
        '''
        Set which replacement algorithm to use.
        @param name: name of replacement algorithm
        @param size: maximum size of hot storage
        @exception: raised if user specifies an unknown replacement algorithm
        '''
        self.size = size
        if name == 'LRU':
            self.replacement_algorithm = LRUQueue(size)
        elif name == 'FIFO':
            self.replacement_algorithm = FIFOQueue(size)
        elif name == 'CLOCK1':
            self.replacement_algorithm = ClockStaticQueue(size)
        elif name == 'CLOCK2':
            self.replacement_algorithm = ClockDynamicQueue(size)
        elif name == 'RANDOM':
            self.replacement_algorithm = RandomQueue(size)
        else:
            raise Exception('Error: Choose a different replacement algorithm')

    def insert(self, table, data):
        '''
        Insert data into both hot and cold storage.
        @param table: table to be inserted into
        @param data: dictionary of data to be stored
        @exception: raised if user fails to define key in their data
        '''
        # for testing purposes only: size 0 bypasses the cache entirely
        if self.size == 0:
            self.cold.insert(table, data['key'], data)
            return
        if 'key' not in data:
            raise Exception('Error: Must define a key')
        # enqueue returns the evicted key, or -1 if nothing was evicted
        val = self.replacement_algorithm.enqueue(data['key'])
        if val != -1:
            self.hot.delete(table, str(val))
        self.hot.insert(table, data)
        self.hot.commit()
        self.cold.insert(table, data['key'], data)

    def query(self, table, key, cols):
        '''
        Query data from table.
        @param table: table to be queried from
        @param key: key of data to be queried
        @param cols: list of columns to be returned
        @return: tuple of data
        '''
        key = str(key)
        # for testing purposes only
        if self.size == 0:
            return self.cold.select(table, key)
        if not self.replacement_algorithm.contains(key):
            # cache miss: promote the row from cold to hot storage,
            # evicting another row if the queue is full
            val = self.replacement_algorithm.enqueue(key)
            if val != -1:
                self.hot.delete(table, str(val))
            data = self.stripColumnFamily(self.cold.select(table, key))
            self.hot.insert(table, data)
            self.hot.commit()
        return self.hot.select(table, key, cols)

    def delete(self, table, key):
        '''
        Delete record from table.
        @param table: table to be deleted from
        @param key: key of row to be deleted
        '''
        key = str(key)
        # for testing purposes only
        if self.size == 0:
            self.cold.delete(table, key)
            return
        if self.replacement_algorithm.delete(key):
            self.hot.delete(table, key)
            self.hot.commit()
        self.cold.delete(table, key)

    def update(self, table, key, data):
        '''
        Update record in table.
        @param table: table to be updated
        @param key: key of the row to be updated
        @param data: dictionary of column/value pairs to be updated
        @exception: raised if the user attempts to update the key
        '''
        key = str(key)
        if 'key' in data:
            raise Exception('Cannot update key')
        # for testing purposes only
        if self.size == 0:
            self.cold.update(table, key, data)
            return
        self.cold.update(table, key, data)
        if self.replacement_algorithm.contains(key):
            self.hot.update(table, key, data)
            self.hot.connection().commit()
        else:
            val = self.replacement_algorithm.enqueue(key)
            if val != -1:
                self.hot.delete(table, str(val))
            data = self.stripColumnFamily(self.cold.select(table, key))
            self.hot.insert(table, data)
            self.hot.commit()

    def scan(self, table):
        '''Scan all rows of a table directly from cold storage.'''
        return self.cold.scan(table)

    def stripColumnFamily(self, data):
        '''
        Strip the column family name from keys.
        @param data: dictionary of unstripped column names to values
        @return: dictionary of stripped column names to values
        '''
        stripped = {}
        for key in data:
            # drop the 'default:' prefix (8 characters) added by HBase
            stripped[key[8:]] = data[key]
        return stripped
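
# A hedged end-to-end sketch of the System cache: create a table, pick a
# replacement policy, and watch reads promote rows from cold (HBase) to hot
# (Postgres) storage. Table name, columns, and values are made-up examples.
def example_cache_usage():
    system = System()
    system.initialize('demo_pages', {'key': 'varchar(32)', 'title': 'varchar(256)'})
    system.setReplacementAlgorithm('LRU', 2)  # hot storage holds two rows
    system.insert('demo_pages', {'key': '1', 'title': 'Asthma'})
    system.insert('demo_pages', {'key': '2', 'title': 'Anemia'})
    system.insert('demo_pages', {'key': '3', 'title': 'Sepsis'})  # LRU evicts key '1'
    return system.query('demo_pages', '1', ['title'])  # re-promotes '1' from cold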
class Pages:
    def __init__(self):
        # initiate all databases and tables
        self.postgres_obj = DB(DB_NAME)
        self.con = self.postgres_obj.connection()
        self.cur = self.postgres_obj.cursor()
        self.hbase = Hbase(DB_NAME)
        self.create_similarity_hbase_table()
        self.table = self.hbase.table("spark_pages_similarity_umls_subset")

    def create_similarity_hbase_table(self):
        # create connection to hbase and the similarity table if it is missing
        self.hbase_connection = self.hbase.get_connection()
        try:
            self.hbase_connection.create_table("spark_pages_similarity_umls_subset", {"paired_page": dict()})
        except happybase.hbase.ttypes.AlreadyExists:
            pass

    def readCSV(self, file_name):
        # read a comma-separated file into a flat list of values
        file_list = []
        with open(file_name, "r") as f:
            for r in f:
                file_list.extend(r.split(","))
        return file_list

    def loadPages(self):
        # load page ids and summaries from postgres
        sql = "select page_id, summary from med_pages_umls_subset"
        self.pages_row = self.postgres_obj.query(sql)
        self.pages_rows_list = list()
        # keep the page id and summary
        for row in self.pages_row:
            self.pages_rows_list.append([row[0], row[1]])

    def pagesJaccSimilarity(self):
        # original unparallelized pilot-project function: calculate the Jaccard
        # similarity between every pair of pages
        set_stop_words = set(self.readCSV("stop_words.txt"))
        sim_pages = self.hbase.table("pages_similarity")
        for i in range(len(self.pages_row)):
            for j in range(i + 1, len(self.pages_row)):
                # treat each summary as a set of words with all stop words removed
                page1_summary = set(re.findall(r"[\w']+", self.pages_row[i][1].lower())).difference(set_stop_words)
                page2_summary = set(re.findall(r"[\w']+", self.pages_row[j][1].lower())).difference(set_stop_words)
                jacc_similarity = (1.0 * len(page2_summary.intersection(page1_summary))) / len(
                    page2_summary.union(page1_summary)
                )
                # store the score under both page ids so lookups work in either direction
                sim_pages.put(
                    str(self.pages_row[i][0]), {"paired_page:" + str(self.pages_row[j][0]): str(jacc_similarity)}
                )
                sim_pages.put(
                    str(self.pages_row[j][0]), {"paired_page:" + str(self.pages_row[i][0]): str(jacc_similarity)}
                )

    def get_id(self, term):
        # get the id of the page whose title matches term
        q = "SELECT distinct(page_id) FROM med_pages_umls_subset WHERE lower(title) LIKE '%%%s%%'" % (term.lower())
        return self.postgres_obj.query(q)[0][0]

    def get_title(self, id):
        # get the title of the page with the given id
        q = "SELECT distinct(title) FROM med_pages_umls_subset WHERE page_id = %d" % (id)
        return self.postgres_obj.query(q)[0][0].lower()

    def query_page(self, id):
        # for a page with the given id, get all paired pages and their similarities
        result = []
        row = self.table.row(str(id))
        for page in row:
            # strip the 'paired_page:' prefix (12 characters) from the qualifier
            result.append((page[12:], row[page]))
        return result

    def getPageLinksList(self, page_id):
        # get the links recorded for a page in the link table
        q = "SELECT links FROM med_pages_umls_subset_links WHERE page_id = %d" % (page_id)
        links = self.postgres_obj.query(q)
        if len(list(links)) > 0:
            return [link.lower() for link in links[0][0].split(",")]
        else:
            return []

    def getPageSummaryContent(self, page_id):
        # get the summary and content stored for a page
        q = "SELECT summary, content FROM med_pages_umls_subset WHERE page_id = %d" % (page_id)
        page_info = self.postgres_obj.query(q)
        if len(list(page_info)) > 0:
            return page_info[0][0], page_info[0][1]
        else:
            return []
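
# A hedged lookup sketch: resolve a page title to its id, then pull its
# stored similarity list from HBase. The title is a made-up example and
# assumes the Spark job has already populated the similarity table.
def example_similar_pages():
    pages = Pages()
    page_id = pages.get_id('asthma')
    neighbors = pages.query_page(page_id)
    # sort paired pages by similarity, highest first, and resolve their titles
    neighbors.sort(key=lambda pair: float(pair[1]), reverse=True)
    return [(pages.get_title(int(pid)), sim) for pid, sim in neighbors[:5]]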