def reducer(self, key, value):
    """Persist one page-pair similarity score into HBase and re-emit it.

    key   -- pair of page ids
    value -- iterable whose first element is the similarity score
    """
    value = list(value)
    # The score is written under BOTH page ids so a lookup from either
    # side of the pair finds it.
    sim_pages = Hbase(DB_NAME).table("mr_pages_similarity_umls")
    first_id, second_id = str(key[0]), str(key[1])
    score = str(value[0])
    sim_pages.put(first_id, {'paired_page:' + second_id: score})
    sim_pages.put(second_id, {'paired_page:' + first_id: score})
    yield key, value
class Pages:
    """Thin access layer over the HBase table of page-pair similarities."""

    def __init__(self):
        # Open HBase, make sure the target table exists, then hold on to it.
        self.hbase = Hbase(DB_NAME)
        self.create_similarity_hbase_table()
        self.table = self.hbase.table("spark_pages_similarity_umls_subset")

    def create_similarity_hbase_table(self):
        """Create the similarity table with its single column family,
        silently succeeding if it already exists."""
        self.hbase_connection = self.hbase.get_connection()
        try:
            self.hbase_connection.create_table(
                "spark_pages_similarity_umls_subset", {'paired_page': dict()})
        except happybase.hbase.ttypes.AlreadyExists:
            # Left over from a previous run -- nothing to do.
            pass

    def insert(self, pid1, pid2, sim):
        """Record sim for the (pid1, pid2) pair under pid1's row."""
        column = 'paired_page:' + str(pid2)
        self.table.put(str(pid1), {column: str(sim)})
 def __init__(self):
     """Open every backing store this object uses.

     NOTE(review): orphaned method fragment (odd 1-space indent, no visible
     enclosing class) -- DB, Hbase and DB_NAME come from elsewhere in the
     project; presumably this belongs to a Pages-style class. TODO confirm.
     """
     # initiate all databases and tables
     self.postgres_obj = DB(DB_NAME)
     self.con = self.postgres_obj.connection()
     self.cur = self.postgres_obj.cursor()
     self.hbase = Hbase(DB_NAME)
     # Must run after self.hbase is set: it uses the HBase connection.
     self.create_similarity_hbase_table()
     self.table = self.hbase.table("spark_pages_similarity_umls_subset")
	def __init__(self):
		"""Connect to HBase, ensure the similarity table exists, keep a handle.

		NOTE(review): fragment of a class whose header is not visible here;
		create_similarity_hbase_table is defined on the enclosing class.
		"""
		self.hbase = Hbase(DB_NAME)
		self.create_similarity_hbase_table()
		self.table = self.hbase.table("spark_pages_similarity_umls_subset")
def main():
    """Smoke test: read back the stored similarity cells for one known page.

    Prints every stored version (with timestamps) of the 'paired_page'
    cells for row '752675' from the MapReduce output table.
    """
    # test to make sure values made it into hbase
    hbase = Hbase(DB_NAME)
    sim_pages = hbase.table("mr_pages_similarity_umls")
    # Fix: parenthesized print of a single value behaves identically on
    # Python 2 and 3; the bare print statement broke under Python 3.
    print(sim_pages.cells('752675', 'paired_page', versions=20000, include_timestamp=True))
#Import stop words and table of page summaries as global variables
# Load the comma-separated stop-word list once at module import time.
# Fix: the original opened the file into a global named `file` (shadowing the
# builtin) and never closed it; a context manager closes it deterministically.
# NOTE(review): tokens keep any trailing newline characters, exactly as the
# original code produced them -- stripping here would change set membership.
file_list = []
with open('stop_words.txt', 'r') as stop_words_file:
    for r in stop_words_file:
        file_list.extend(r.split(','))
stop_words = set(file_list)

# Build the page_id -> summary lookup from a file with one JSON object per
# line. Used as a global by the MapReduce job below.
contentHash = {}
with open('pages.json') as data_file:
    for line in data_file:
        # json.loads of a JSON object already yields a dict; the original
        # dict(...) wrapper was redundant.
        d = json.loads(line)
        contentHash[d['page_id']] = d['summary']

#Create an Hbase for similarity values
# Module-level setup: make sure the output table for the MapReduce job
# exists before the job starts writing to it.
hbase = Hbase(DB_NAME)
hbase_connection = hbase.get_connection()
try:
        # Single column family 'paired_page' with default options.
        hbase_connection.create_table("mr_pages_similarity_umls", {'paired_page':dict()})
except happybase.hbase.ttypes.AlreadyExists:
        # Table survives from a previous run; reuse it as-is.
        pass

#Start the timer
# Wall-clock start of the whole run; startMap/finish look like placeholders
# that timing code elsewhere in the file fills in -- TODO confirm.
start = time.time()
startMap = 0
finish = 0

class MRPageAssociate(MRJob):
        def mapper(self,_,line):
                j= json.loads(line)
                d = dict(j)
Esempio n. 7
0
 def __init__(self):
     """Open both storage tiers used by the cache layer.

     NOTE(review): orphaned method fragment (1-space indent, enclosing class
     not visible) -- matches the System class constructor below.
     """
     # 'hot' = relational cache tier, 'cold' = authoritative HBase tier.
     self.hot = DB(DB_NAME)
     self.cold = Hbase(DB_NAME)
Esempio n. 8
0
class System:
    '''
    Default Constructor
    '''
    def __init__(self):
        self.hot = DB(DB_NAME)
        self.cold = Hbase(DB_NAME)

    '''
    initialize a table
    @param name: name of table
    @param cols: list of tuples of column name and type
    '''

    def initialize(self, name, cols):
        try:
            self.cold.get_connection().create_table(name, {'default': dict()})
        except happybase.hbase.ttypes.AlreadyExists:
            self.cold.get_connection().disable_table(name)
            self.cold.get_connection().delete_table(name)
            self.cold.get_connection().create_table(name, {'default': dict()})
        self.hot.cursor().execute("drop table if exists " + name + ";")
        create_table = "create table " + name + "("
        for key in sorted(cols):
            create_table += key + ' ' + cols[key] + ","
        create_table = create_table[:len(create_table) - 1] + ");"
        self.hot.cursor().execute(create_table)
        self.hot.connection().commit()

    '''
    load data into table directly into cold storage
    @param table: name of table to be loaded
    @param data_list: list of dictionaries containing dictionaries
    '''

    def load(self, table, data_list):
        for data in data_list:
            if 'key' not in data:
                raise Exception('Error: Must define a key')
            self.cold.insert(table, data['key'], data)

    '''
    Clears Cache
    '''

    def clear(self):
        self.replacement_algorithm.printContents()
        self.replacement_algorithm.clear()

    '''
    Set which replacement algorithm to use
    @param name: name of replacement algorithm
    @param size: maximum size of hot storage
    @exception: Raised if user specifies an incorrect replacement algorithm
    '''

    def setReplacementAlgorithm(self, name, size):
        self.size = size
        if name == 'LRU':
            self.replacement_algorithm = LRUQueue(size)
        elif name == 'FIFO':
            self.replacement_algorithm = FIFOQueue(size)
        elif name == 'CLOCK1':
            self.replacement_algorithm = ClockStaticQueue(size)
        elif name == 'CLOCK2':
            self.replacement_algorithm = ClockDynamicQueue(size)
        elif name == 'RANDOM':
            self.replacement_algorithm = RandomQueue(size)
        else:
            raise Exception('Error: Choose a different replacement algorithm')

    '''
    Insert data into both hot and cold storage
    @param table: table to be inserted into
    @param data: dictionary of data to be stored
    @exception: raised if user fails to define key in their data
    '''

    def insert(self, table, data):
        #For testing purposes only
        if self.size == 0:
            self.cold.insert(table, data['key'], data)
            return
        #end testing purposes

        if 'key' not in data:
            raise Exception('Error: Must define a key')
        val = self.replacement_algorithm.enqueue(data['key'])
        if val != -1:
            self.hot.delete(table, str(val))
        self.hot.insert(table, data)
        self.hot.commit()
        self.cold.insert(table, data['key'], data)

    '''
    Query data from table
    @param table: table to be queried from
    @param key: key of data to be queried
    @param cols: list of columns to be returned
    @return: tuple of data 
    '''

    def query(self, table, key, cols):
        key = str(key)
        #for testing purposes only
        if self.size == 0:
            return self.cold.select(table, key)
        #end testing purposes

        if not self.replacement_algorithm.contains(key):
            val = self.replacement_algorithm.enqueue(
                key
            )  #returns -1 if the queue is full, the key of the data if not
            if val != -1:
                self.hot.delete(table, str(val))
            data = self.stripColumnFamily(self.cold.select(table, key))
            self.hot.insert(table, data)
            self.hot.commit()
        return self.hot.select(table, key, cols)

    '''
    Delete record from table
    @param table: table to be deleted from
    @param key: key of row to be deleted
    '''

    def delete(self, table, key):
        key = str(key)
        #for testing purposes
        if self.size == 0:
            self.cold.delete(table, key)
            return
        #end testing purposes
        if self.replacement_algorithm.delete(key):
            self.hot.delete(table, key)
            self.hot.commit()
        self.cold.delete(table, key)

    '''
    update record from table
    @param table: table to be updated
    @param key: key of the row to be updated
    @param data: dictonary of column value pairs to be updated
    @exception: thrown if the user attempts to update key
    '''

    def update(self, table, key, data):
        key = str(key)
        if 'key' in data:
            raise Exception('Cannot update key')
        #testing purposes only
        if self.size == 0:
            self.cold.update(table, key, data)
            return
        #end testing purposes
        self.cold.update(table, key, data)
        if self.replacement_algorithm.contains(key):
            self.hot.update(table, key, data)
            self.hot.connection().commit()
        else:
            val = self.replacement_algorithm.enqueue(key)
            if val != -1:
                self.hot.delete(table, str(val))
            data = self.stripColumnFamily(self.cold.select(table, key))
            self.hot.insert(table, data)
            self.hot.commit()

    '''

    '''

    def scan(self, table):
        return self.cold.scan(table)

    '''
    strip column family name from keys
    @param data: dictionary of unstripped column name to values
    @return: dictionary of stripped column names to values
    '''

    def stripColumnFamily(self, data):
        dict = {}
        for key in data:
            dict[key[8:]] = data[key]
        return dict
class Pages:
    """Pilot-project access layer for medical pages.

    Postgres holds page text, titles and links; HBase holds the pairwise
    Jaccard similarity scores between page summaries.
    """

    def __init__(self):
        # initiate all databases and tables
        self.postgres_obj = DB(DB_NAME)
        self.con = self.postgres_obj.connection()
        self.cur = self.postgres_obj.cursor()
        self.hbase = Hbase(DB_NAME)
        # Must run after self.hbase is set: it uses the HBase connection.
        self.create_similarity_hbase_table()
        self.table = self.hbase.table("spark_pages_similarity_umls_subset")

    def create_similarity_hbase_table(self):
        """Create the HBase similarity table, ignoring 'already exists'."""
        # create connection to hbase table
        self.hbase_connection = self.hbase.get_connection()
        try:
            self.hbase_connection.create_table("spark_pages_similarity_umls_subset", {"paired_page": dict()})
        except happybase.hbase.ttypes.AlreadyExists:
            pass

    def readCSV(self, file_name):
        """Read a comma-separated file and return the flat list of tokens.

        Tokens keep any trailing newline characters, exactly as before.
        Fix: the file is now opened with a context manager so the handle is
        closed (the original opened it and never closed it).
        """
        file_list = []
        with open(file_name, "r") as csv_file:
            for r in csv_file:
                file_list.extend(r.split(","))
        return file_list

    def loadPages(self):
        """Load (page_id, summary) rows from Postgres into self.pages_rows_list."""
        sql = "select page_id, summary from med_pages_umls_subset"
        self.pages_row = self.postgres_obj.query(sql)
        self.pages_rows_list = list()
        # keep the page id and summary
        for row in self.pages_row:
            self.pages_rows_list.append([row[0], row[1]])

    def pagesJaccSimilarity(self):
        """Unparallelized pilot: compute Jaccard similarity between every
        pair of page summaries and store each score in HBase under both ids.

        NOTE(review): raises ZeroDivisionError if two summaries are both
        empty after stop-word removal (empty union) -- TODO confirm whether
        that can occur in this dataset.
        """
        set_stop_words = set(self.readCSV("stop_words.txt"))
        sim_pages = self.hbase.table("pages_similarity")
        for i in range(len(self.pages_row)):
            for j in range(i + 1, len(self.pages_row)):
                # treat summary as a set of words with all stop words removed
                page1_summary = set(re.findall(r"[\w']+", self.pages_row[i][1].lower())).difference(set_stop_words)
                page2_summary = set(re.findall(r"[\w']+", self.pages_row[j][1].lower())).difference(set_stop_words)
                jacc_similarity = (1.0 * len(page2_summary.intersection(page1_summary))) / len(
                    page2_summary.union(page1_summary)
                )
                # load pages into hbase, keyed from both sides of the pair
                sim_pages.put(
                    str(self.pages_row[i][0]), {"paired_page:" + str(self.pages_row[j][0]): str(jacc_similarity)}
                )
                sim_pages.put(
                    str(self.pages_row[j][0]), {"paired_page:" + str(self.pages_row[i][0]): str(jacc_similarity)}
                )

    def get_id(self, term):
        """Return the page_id of the first page whose title contains term.

        NOTE(review): SQL built with %-interpolation -- vulnerable to SQL
        injection if term is untrusted; parameterize via the query API if it
        supports placeholders.
        """
        q = "SELECT distinct(page_id) FROM med_pages_umls_subset WHERE lower(title) LIKE '%%%s%%'" % (term.lower())
        return self.postgres_obj.query(q)[0][0]

    def get_title(self, id):
        """Return the lowercased title of the page with the given id."""
        q = "SELECT distinct(title) FROM med_pages_umls_subset WHERE page_id = %d" % (id)
        return self.postgres_obj.query(q)[0][0].lower()

    def query_page(self, id):
        """Return [(other_page_id, similarity), ...] for the page with id."""
        result = []
        row = self.table.row(str(id))
        for page in row:
            # Strip the 'paired_page:' column-family prefix (12 characters).
            result.append((page[12:], row[page]))
        return result

    def getPageLinksList(self, page_id):
        """Return the page's outgoing links, lowercased; [] if none stored."""
        q = "SELECT links FROM med_pages_umls_subset_links WHERE page_id = %d" % (page_id)
        links = self.postgres_obj.query(q)
        if len(list(links)) > 0:
            return [link.lower() for link in links[0][0].split(",")]
        else:
            return []

    def getPageSummaryContent(self, page_id):
        """Return (summary, content) for the page, or [] if the id is unknown.

        NOTE(review): the miss case returns a list while the hit case returns
        a tuple -- kept as-is for caller compatibility.
        """
        q = "SELECT summary, content FROM med_pages_umls_subset WHERE page_id = %d" % (page_id)
        page_info = self.postgres_obj.query(q)
        if len(list(page_info)) > 0:
            return page_info[0][0], page_info[0][1]
        else:
            return []