# Example 1
# File: engine.py  Project: hoov/flywheel
    def refresh(self, items, consistent=False):
        """
        Reload the given models with the freshest data from DynamoDB.

        Parameters
        ----------
        items : list or :class:`~flywheel.models.Model`
            Models to sync
        consistent : bool, optional
            If True, force a consistent read from the db. (default False)

        """
        # Accept a bare model as a convenience; normalize to a list.
        if isinstance(items, Model):
            items = [items]
        if not items:
            return

        # Bucket the models by the DynamoDB table they live in so each
        # table is hit with exactly one batch_get call.
        by_table = defaultdict(list)
        for model in items:
            by_table[model.meta_.ddb_tablename].append(model)

        for tablename, models in by_table.iteritems():
            table = Table(tablename, connection=self.dynamo)
            primary_keys = [model.pk_dict_ for model in models]
            fresh_rows = table.batch_get(primary_keys, consistent=consistent)
            # Results are paired positionally with the requested models.
            for model, data in itertools.izip(models, fresh_rows):
                with model.loading_(self):
                    for key, val in data.items():
                        model.set_ddb_val_(key, val)
# Example 2
class Comment(object):
    """Thin data-access wrapper around the DynamoDB 'Comment' table."""

    def __init__(self):
        self.table = Table('Comment')

    def create(self, comment_id, comment_info):
        """Insert a new comment row keyed by *comment_id*."""
        self.table.put_item(data=dict(comment_id=comment_id,
                                      info=comment_info))

    def get(self, comment_id):
        """Return the 'info' attribute of the comment with *comment_id*."""
        return self.table.get_item(comment_id=comment_id)['info']

    def update(self, comment_id, comment_info):
        """Overwrite the 'info' attribute of an existing comment."""
        row = self.table.get_item(comment_id=comment_id)
        row['info'] = comment_info
        row.save()

    def batch_query(self, comment_id_list):
        """Batch-fetch many comments; return their 'info' values."""
        key_dicts = [dict(comment_id=cid) for cid in comment_id_list]
        rows = self.table.batch_get(keys=key_dicts)
        return [row['info'] for row in rows]
# Example 3
class Post(object):
    """Thin data-access wrapper around the DynamoDB 'Post' table."""

    def __init__(self):
        self.table = Table('Post')

    def create(self, post_id, post_info):
        """Insert a new post row keyed by *post_id*."""
        self.table.put_item(data=dict(post_id=post_id, info=post_info))

    def get(self, post_id):
        """Return the 'info' attribute of the post with *post_id*."""
        return self.table.get_item(post_id=post_id)['info']

    def update(self, post_id, post_info):
        """Overwrite the 'info' attribute of an existing post."""
        row = self.table.get_item(post_id=post_id)
        row['info'] = post_info
        row.save()

    def batch_query(self, post_id_list):
        """Batch-fetch many posts; return their 'info' values."""
        key_dicts = [dict(post_id=pid) for pid in post_id_list]
        rows = self.table.batch_get(keys=key_dicts)
        return [row['info'] for row in rows]
# Example 4
class Collection(CollectionEngine):
    """DynamoDB-backed collection engine built on boto's Table API."""

    def __init__(self, name, table_name, region, host=None, is_secure=None, port=None):
        """Connect to *table_name* in *region*.

        host / is_secure / port, when given, override the default endpoint
        settings (useful e.g. for DynamoDB Local).
        """
        kwargs = {}
        if host is not None:
            kwargs['host'] = host
        if is_secure is not None:
            kwargs['is_secure'] = is_secure
        if port is not None:
            kwargs['port'] = port
        self.__table = Table(
            table_name=table_name,
            connection=boto.dynamodb2.connect_to_region(region, **kwargs),
        )

    @property
    def table(self):
        """The underlying boto Table handle."""
        return self.__table

    def create_raw_item(self, index, data_dict, context):
        """Write *data_dict* (directly, or via *context* when batching) and
        return a BotoItem wrapper.

        NOTE(review): *index* is accepted but unused here -- confirm callers.
        """
        if context is None:
            self.__table.put_item(data_dict)
        else:
            context.put_item(self.__table, data_dict)
        return BotoItem(self.__table, data_dict, True)

    def retrieve_raw_item(self, key_dict):
        """Fetch one item by its key dict; raise KeyError when absent.

        (The previous bare ``except: raise`` clause was a no-op re-raise
        and has been removed; all other exceptions still propagate.)
        """
        try:
            return self.__table.get_item(**key_dict)
        except ItemNotFound:
            # Translate boto's error into the standard mapping error.
            raise KeyError(key_dict)

    def query_raw_items(self, index, parent_key_value, **kwargs):
        """Query *index*, optionally constraining on (field, value) equality."""
        if parent_key_value is not None:
            kwargs['{}__eq'.format(parent_key_value[0])] = parent_key_value[1]
        return self.__table.query(index=index, **kwargs)

    def bulk_get_raw_items(self, **kwargs):
        """Batch-read items; kwargs go straight to Table.batch_get."""
        return self.__table.batch_get(**kwargs)

    @classmethod
    def get_context(cls, *args, **kwargs):
        """Return a fresh batching context."""
        return Context()
# Example 5
class Event(object):
    """Thin data-access wrapper around the DynamoDB 'Event' table."""

    def __init__(self):
        self.table_event = Table('Event')

    def create_new_event(self, event_id, event_info):
        """Insert a new event row keyed by *event_id*."""
        print(event_id)  # kept from the original: log the id being created
        self.table_event.put_item(data=dict(event_id=event_id,
                                            info=event_info))

    def get_event_info_by_event_id(self, event_id):
        """Return the 'info' attribute of the event with *event_id*."""
        return self.table_event.get_item(event_id=event_id)['info']

    def update_event_info_by_event_id(self, event_id, event_info):
        """Overwrite the 'info' attribute of an existing event."""
        row = self.table_event.get_item(event_id=event_id)
        row['info'] = event_info
        row.save()

    def batch_query_by_event_id_list(self, event_id_list):
        """Batch-fetch events; return [{'event_id': ..., 'info': ...}, ...]."""
        key_dicts = [dict(event_id=eid) for eid in event_id_list]
        rows = self.table_event.batch_get(keys=key_dicts)
        return [dict(event_id=row['event_id'], info=row['info'])
                for row in rows]
class DynamoDBAdapter(key_value_store.KeyValueStore):

    """ Implementation of an abstract key-value store defined in
    key_value_store.py. The underlying database is amazon DynamoDB.

    The store keeps all objects in a single table with following schema:
    [HashKey('kind', data_type=STRING), RangeKey('id')]. 'kind' is the string
    with the object type ('vector', 'set' or 'int') and 'id' is the object id.
    The object value is stored in the 'value' attribute of the table items.

    The table should be created before this code is executed. Amazon
    configuration is assumed to be stored in ~/.boto file as described in
    http://boto.readthedocs.org/en/latest/boto_config_tut.html
    """

    def __init__(self, precision=np.dtype('float32'), table_name='test'):
        """ Create an instance of the dynamodb key-value store.
        precision - a numpy type, elements of all vectors are converted and
           stored in this type;
        table_name - the name of the DynamoDB table which keeps the objects.
        """
        conn = boto.dynamodb2.connect_to_region('eu-west-1')
        if not isinstance(precision, np.dtype):
            raise TypeError("Precision should be a numpy.dtype subtype")
        self.precision = precision
        self.precision_name = precision.name
        self.table = Table(table_name, connection=conn)

    def _get_or_create_item(self, kind, item_id):
        """Return the stored (kind, id) item, or a new unsaved Item with the
        key attributes pre-filled when it does not exist yet."""
        try:
            item = self.table.get_item(kind=kind, id=item_id)
        except ItemNotFound:
            item = Item(self.table)
            item['kind'] = kind
            item['id'] = item_id
        return item

    def _create_vector_item(self, vec_id, vector):
        """Build (or refresh) the item storing *vector*, serialized in
        self.precision, together with the precision name for decoding."""
        item = self._get_or_create_item('vector', vec_id)
        item['value'] = Binary(vector.astype(self.precision).tostring())
        item['precision'] = self.precision_name
        return item

    def _vector_value(self, item):
        """Deserialize an item's binary 'value' back into a numpy array."""
        return np.fromstring(str(item['value']), np.dtype(item['precision']))

    def get_vector_ids(self):
        """Return the ids of all stored vectors."""
        return [v['id'] for v in self.table.query_2(kind__eq='vector')]

    def get_int_ids(self):
        """Return the ids of all stored ints."""
        return [v['id'] for v in self.table.query_2(kind__eq='int')]

    def get_set_ids(self):
        """Return the ids of all stored sets."""
        return [v['id'] for v in self.table.query_2(kind__eq='set')]

    def store_vector(self, vec_id, vector):
        """Store (or overwrite) a single vector under *vec_id*."""
        item = self._create_vector_item(vec_id, vector)
        item.save()

    def get_vector(self, vec_id):
        """Return the vector stored under *vec_id*; KeyError when absent."""
        try:
            item = self.table.get_item(kind='vector', id=vec_id)
        except ItemNotFound:
            raise KeyError('Vector key %s is unknown' % (vec_id,))
        return self._vector_value(item)

    def bulk_get_vector(self, vec_ids):
        """Batch-fetch several vectors; returns a list of numpy arrays."""
        keys = [{'kind': 'vector', 'id': i} for i in vec_ids]
        vs = self.table.batch_get(keys=keys)
        return [self._vector_value(i) for i in vs]

    def remove_vector(self, vec_id):
        """Delete the vector stored under *vec_id*; KeyError when absent."""
        try:
            item = self.table.get_item(kind='vector', id=vec_id)
        except ItemNotFound:
            raise KeyError('Vector key %s is unknown' % (vec_id,))
        item.delete()

    def add_to_set(self, set_id, element_id):
        """Add *element_id* to the stored set, creating the set if needed."""
        item = self._get_or_create_item('set', set_id)
        # Reset the value when missing or not a set (defensive against
        # a different kind having been stored under this id).
        if 'value' not in item.keys() or not isinstance(item['value'], set):
            item['value'] = set()
        item['value'].add(element_id)
        item.save(overwrite=True)

    def remove_from_set(self, set_id, element_id):
        """Remove *element_id* from the stored set; KeyError when the set,
        its value, or the element is missing."""
        try:
            item = self.table.get_item(kind='set', id=set_id)
        except ItemNotFound:
            raise KeyError('Set key %s is unknown' % (set_id,))
        if 'value' not in item.keys() or not isinstance(item['value'], set):
            raise KeyError('Incorrect value in item %s' % (set_id,))
        if element_id not in item['value']:
            raise KeyError('Element %s not in set %s' % (element_id, set_id))
        item['value'].remove(element_id)
        item.save()

    def remove_set(self, set_id):
        """Delete the whole set item; KeyError when absent."""
        try:
            item = self.table.get_item(kind='set', id=set_id)
            item.delete()
        except ItemNotFound:
            raise KeyError('Set key %s is unknown' % (set_id,))

    def get_set(self, set_id):
        """Return the stored set with entries coerced to str; KeyError when
        absent."""
        try:
            the_set = self.table.get_item(kind='set', id=set_id)['value']
            return set([str(entry) for entry in the_set])
        except ItemNotFound:
            raise KeyError('Set key %s is unknown' % (set_id,))

    def store_int(self, int_id, integer):
        """Store (or overwrite) a single integer under *int_id*."""
        item = self._get_or_create_item('int', int_id)
        item['value'] = integer
        item.save()

    def get_int(self, int_id):
        """Return the integer stored under *int_id*; KeyError when absent."""
        try:
            return int(self.table.get_item(kind='int', id=int_id)['value'])
        except ItemNotFound:
            raise KeyError('Int key %s is unknown' % (int_id,))

    def remove_int(self, int_id):
        """Delete the integer stored under *int_id*; KeyError when absent."""
        try:
            item = self.table.get_item(kind='int', id=int_id)
        except ItemNotFound:
            raise KeyError('Int key %s is unknown' % (int_id,))
        item.delete()

    def _aggregate_set_id_element_pairs(self, setpairs):
        """Turns a list of pairs of the form (set_id, element_id) into a list 'L' of
        pairs 'p' of the form (set_id, set_of_element_ids). 'L' has the property
        that if 'p' and 'q' are distinct entries in 'L', then p[0] and q[0] are
        also distinct."""
        set_ids = set([entry[0] for entry in setpairs])
        listlist = [[entry for entry in setpairs if entry[0] == set_id]
                    for set_id in set_ids]
        result = [(pairlist[0][0], set([entry[1] for entry in pairlist]))
                  for pairlist in listlist]
        return result

    def bulk_store_vector(self, vec_ids, vectors):
        """Store vectors[i] under vec_ids[i] for all i, in one batch write.
        Raises ValueError when the two lists differ in length."""
        if len(vec_ids) != len(vectors):
            raise ValueError
        vecpairs = zip(vec_ids, vectors)
        with self.table.batch_write() as batch:
            for vec_id, vec in vecpairs:
                item = self._create_vector_item(vec_id, vec)
                batch.put_item(item)

    def bulk_store_vector_old(self, vectors_df):
        """Store every row of *vectors_df* (a dataframe indexed by vector id)
        as a vector item, in one batch write.
        """
        # BUG FIX: the previous length check compared `vec_ids` and `vectors`,
        # names that do not exist in this method (copy-pasted from
        # bulk_store_vector), so every call raised NameError.  A dataframe
        # needs no such check, so it is simply removed.
        with self.table.batch_write() as batch:
            for ind in vectors_df.index:
                vec_id = str(ind)
                vec = vectors_df.loc[ind].values
                item = self._create_vector_item(vec_id, vec)
                batch.put_item(item)

    def bulk_store_int(self, int_ids, integers):
        """Store integers[i] under int_ids[i] for all i, in one batch write.
        Raises ValueError when the two lists differ in length."""
        if len(int_ids) != len(integers):
            raise ValueError
        intpairs = zip(int_ids, integers)
        with self.table.batch_write() as batch:
            for pair in intpairs:
                int_id, integer = pair
                item = self._get_or_create_item('int', int_id)
                item['value'] = integer
                batch.put_item(item)

    def bulk_add_to_set(self, set_ids, element_ids):
        """batch_write() objects if the same item is written to more
        than once per batch, hence we aggregate all (set_id, element_id)
        pairs into a list of pairs (set_id, element_ids), where
        the 'set_id's are pairwise distinct, and the 'element_ids'
        are sets."""
        if len(set_ids) != len(element_ids):
            raise ValueError
        setpairs = zip(set_ids, element_ids)
        setlist = self._aggregate_set_id_element_pairs(setpairs)
        with self.table.batch_write() as batch:
            for pair in setlist:
                set_id, element_ids = pair
                item = self._get_or_create_item('set', set_id)
                if 'value' not in item.keys() or not isinstance(
                        item['value'], set):
                    item['value'] = set()
                item['value'].update(element_ids)
                batch.put_item(item)
# Example 7
class DDBSlurps(Dynamo):
    """Data-access layer for the 'slurps' and 'failed_slurps' DynamoDB tables.

    Python 2 code (uses `basestring` and `raise Exc, msg` syntax).  Use the
    `from_test_mode` factory to operate on the test tables instead of the
    production ones.
    """

    @classmethod
    def from_test_mode(cls, access_key=None, secret=None):
        """
        Use this for getting an instance of this class that uses test tables.
        """
        instance = cls(access_key, secret)
        # Re-point both table handles at the test tables.
        instance.slurps_table = Table('test_slurps',
                                      connection=instance.connection)
        instance.failed_slurps_table = Table('test_failed_slurps',
                                             connection=instance.connection)
        return instance

    def __init__(self, access_key=None, secret=None):
        """
        ! Use test_mode factory method for instantiating this class with test_slurps and test_failed_slurps tables
        """
        super(DDBSlurps, self).__init__(access_key, secret)

        # Production table handles; from_test_mode() overwrites these.
        self.slurps_table = Table('slurps', connection=self.connection)
        self.failed_slurps_table = Table('failed_slurps',
                                         connection=self.connection)

    def save_slurp_info(self, slurp_info_, overwrite=True):
        """
        slurp_info_ can either be in the form of a list of dicts or else a single dict.
        If slurp_info is a list, batch write will be used
        """
        if isinstance(slurp_info_, dict):
            self.slurps_table.put_item(slurp_info_, overwrite=overwrite)
        elif isinstance(slurp_info_, list):
            # The batch writer flushes automatically when the context exits.
            with self.slurps_table.batch_write() as batch:
                for s in slurp_info_:
                    batch.put_item(data=s, overwrite=overwrite)
        else:
            raise TypeError, "slurp_info must be a dict or a list of dicts, not a {}".format(
                type(slurp_info_))

    def save_failed_slurp(self, searchterm):
        # Record the failed search term together with the failure timestamp.
        self.failed_slurps_table.put_item(data={
            'searchterm':
            searchterm,
            'datetime':
            datetime.now().isoformat()
        },
                                          overwrite=True)

    def get_slurp_info(self, search_term_=None):
        """
        search_term_ can be either a string or a list of strings. Each string should be a search term you are looking
        for in the db.
        Returns either a single list of key-value tuples (if search_term_ was a string)
        or a list of key-value tuples (if search_term_ was a list)
        Each list of key-value tuples can easily be converted to a dict or an OrderedDict by the client.
        """

        # searchterm_ is a STRING
        if isinstance(search_term_, basestring):
            if search_term_:
                slurp_info = (self.slurps_table.get_item(
                    searchterm=search_term_)).items()
            else:
                slurp_info = []

        # searchterm is a LIST of strings
        elif isinstance(search_term_, list):
            if search_term_:
                # create a set of non-empty searchterms. We use a set to avoid a duplicate query error from the db
                set_of_sts = {st for st in search_term_ if st}
                # create a list of dicts from the set
                list_of_st_dicts = [{'searchterm': st} for st in set_of_sts]
                res = self.slurps_table.batch_get(list_of_st_dicts)
                try:
                    slurp_info = [i.items() for i in res]
                except (StopIteration, IndexError):
                    # If res is empty, we get one of these errors when trying to iterate.
                    slurp_info = []
            else:
                slurp_info = []

        # searchterm is an unexpected type
        else:
            raise TypeError, "search_term_ must be a dict or a list of dicts, not a {}".format(
                type(search_term_))

        return slurp_info

    def existing_and_missing_uni(self, searchterm_list):
        """
        Takes a list of searchterm strings and returns a list of searchterm strings that were found in the db (in unicode)
        and a list of the searchterms that were missing from the found results
        """
        # make sure in utf8 before we send request to the db
        input_sts_utf8 = [to_utf8_or_bust(i) for i in searchterm_list]
        found_sts_info = self.get_slurp_info(input_sts_utf8)
        found_sts_uni = [
            to_unicode_or_bust(dict(i)['searchterm']) for i in found_sts_info
        ]
        input_sts_uni = [to_unicode_or_bust(i) for i in input_sts_utf8]
        # presumably setdiff preserves the input ordering -- it comes from
        # the order_conserving module; confirm if ordering matters here.
        missing_sts_uni = order_conserving.setdiff(input_sts_uni,
                                                   found_sts_uni)
        return found_sts_uni, missing_sts_uni

    def get_table(self, table_name):
        """
        Convenience method for client who may wish to get a specific table from the DynamoDB connection
        """
        return Table(table_name, connection=self.connection)

    def truncate_failed_slurp_table(self):
        """
        Delete every item from the failed_slurps table via one batch write.
        """
        with self.failed_slurps_table.batch_write() as batch:
            for item in self.failed_slurps_table.scan():
                batch.delete_item(searchterm=item['searchterm'])

    def truncate_slurp_table(self):
        """
        WARNING! Only use for test mode table
        """
        # Guard: refuse to truncate anything but the test table.
        assert self.slurps_table.table_name == 'test_slurps', "Will only truncate test slurps table. To truncate production table, run code manually"
        test_slurps_table = Table('test_slurps', connection=self.connection)
        with test_slurps_table.batch_write() as batch:
            for item in self.slurps_table.scan():
                batch.delete_item(searchterm=item['searchterm'])

    def modify_failed_slurps_throughput(self, requested_read, requested_write):
        # modify_throughput is defined outside this class (not visible here).
        return self.modify_throughput(requested_read, requested_write,
                                      self.failed_slurps_table)

    def modify_slurps_throughput(self, requested_read, requested_write):
        return self.modify_throughput(requested_read, requested_write,
                                      self.slurps_table)

    def get_slurps_table_info(self):
        # get_table_info is defined outside this class (not visible here).
        return self.get_table_info(self.slurps_table)

    def get_failed_slurps_table_info(self):
        return self.get_table_info(self.failed_slurps_table)
###########################
## Batch Reading
###########################

# Similar to batch writing, batch reading can also help reduce the number of API
# requests necessary to access a large number of items.

# NOTE(review): this snippet assumes a boto connection object named `conn`
# created earlier in the file, and is Python 2 (print statement below).
from boto.dynamodb2.table import Table
users = Table('users2', connection=conn)
# Each key dict supplies the table's hash and range key values.
many_users = users.batch_get(keys=[
    {
        'account_type': 'standard_user',
        'last_name': 'Doe'
    },
    {
        'account_type': 'standard_user',
        'last_name': 'Doering'
    },
    {
        'account_type': 'super_user',
        'last_name': 'Liddel'
    },
])
for user in many_users:
    print user['first_name']

###########################
## Table Scan
###########################
from boto.dynamodb2.table import Table
users = Table('users2', connection=conn)
# Scan with a birthday != 'null' filter, capped at 50 items.
scan_results = users.scan(birthday__ne='null', limit=50)
# Example 9
    def __init__(self, term, database, score):
        """Collect statistics for articles matching *term* in the *database*
        source table whose tdIdfRoundTo7 score exceeds *score*.

        Scans the IndexTdIdf table for matching article ids, batch-fetches
        the articles in groups of 100, then accumulates per-day article
        counts, the observed date range, and lexical-diversity /
        subjectivity / polarity value lists on self.
        """

        # create variables for storing data of interest
        self.lexdiv = []
        self.subjectivity = []
        self.polarity = []
        # put both maxdate and mindate in formats that can be compared to later with dates outside the range
        self.origmaxdate = arrow.now().replace(years=-5)
        self.maxdate = self.origmaxdate.format("YYYY-MM-DD HH:mm")
        self.origmindate = arrow.now()
        self.mindate = self.origmindate.format("YYYY-MM-DD HH:mm")
        self.articleIDs = []
        self.term = term
        self.source = database
        self.score = score
        self.articleIDsub = []
        self.IDtracker = []
        self.dayscount = {}
        # each source table stores times differently.  This dictionary contains all possibilities and the
        # correct arrow parse format is selected using self.source as a key
        self.dtstringdict = {
            "DailyBeast": "MM-DD-YY h:mm",
            "CNN": "MMM-D-YYYY h:mm",
            "Huffington_Post": "YYYY-MM-DD HH:mm:ss",
            "Washington_Post": "YYYY-MM-DD HH:mm:ss",
            "RollingStone": "YYYY-MM-DD HH:mm:ssZZ",
            "SF_Gate": "YYYY-MM-DD HH:mm:ss",
        }
        self.dtstring = self.dtstringdict[self.source]

        # table indices are named differently per source.  This dictionary contains all possibilities and
        # the correct one is selected using self.source as a key
        self.indexdict = {
            "DailyBeast": "tstamp",
            "CNN": "tstamp",
            "Huffington_Post": "id",
            "Washington_Post": "id",
            "RollingStone": "id",
            "SF_Gate": "id",
        }
        self.index = self.indexdict[self.source]

        # the name of columns that have dates are different.  This dictionary selects the correct one.

        self.dtdict = {
            "DailyBeast": "datetime",
            "CNN": "datetime",
            "Huffington_Post": "date",
            "Washington_Post": "date",
            "RollingStone": "date",
            "SF_Gate": "date",
        }
        self.dtstr = self.dtdict[self.source]

        # These data sources were set up without a RangeKey on Dynamo.  Therefore to use batch get, a
        # slightly different key syntax is needed.  A later if/then statement handles this.
        self.norangesources = ["Huffington_Post", "Washington_Post", "RollingStone", "SF_Gate"]

        # connect to dynamodb
        # NOTE(review): credentials are blank here -- presumably injected
        # elsewhere (env/boto config); confirm before running outside a sandbox.
        conn = boto.dynamodb2.connect_to_region("us-west-2", aws_access_key_id="", aws_secret_access_key="")

        # Create table variables.
        table = Table(self.source, connection=conn)
        querytable = Table("IndexTdIdf", connection=conn)

        # Collect all articleIDs from IndexTdIdf table that match the term given and the desired score;
        # groups keys into dictionaries of 100 for batch get (DynamoDB batch-get limit)

        for row in querytable.scan():
            if (
                row["word"] == self.term
                and row["source"] == self.source
                and row["tdIdfRoundTo7"] > self.score
                and int(float(row["articleId"])) not in self.IDtracker
            ):
                # some sources take hash and range key and others just hash key for batch get
                if self.source in self.norangesources:
                    dictvalue = {self.index: int(float(row["articleId"]))}
                else:
                    dictvalue = {self.index: int(float(row["articleId"])), "source": self.source}
                self.articleIDsub.append(dictvalue)
                self.IDtracker.append(int(float(row["articleId"])))
                if len(self.articleIDsub) == 100:
                    self.articleIDs.append(self.articleIDsub)
                    self.articleIDsub = []
                else:
                    continue
        # make sure to append any leftover keys that did not fill a group of 100
        self.articleIDs.append(self.articleIDsub)

        for entry in self.articleIDs:
            results = table.batch_get(keys=entry)
            for result in results:
                # NOTE(review): 'x is not None or ""' parses as '(x is not None) or ""',
                # which is equivalent to a plain is-not-None test -- the '' (and the 0s
                # below) are dead operands.  Confirm whether empty values should be skipped.
                if result[self.dtstr] is not None or "":
                    try:
                        tm = arrow.get(result[self.dtstr], self.dtstring)
                        tm = tm.format("YYYY-MM-DD")
                    except:
                        # unparseable date: skip this article entirely
                        continue

                    if tm in self.dayscount.keys():
                        self.dayscount[tm] += 1
                    else:
                        self.dayscount[tm] = 1

                    if tm > self.maxdate:
                        self.maxdate = result[self.dtstr]
                    if tm < self.mindate:
                        self.mindate = result[self.dtstr]

                if (result["lexical diversity"]) is not None or 0:
                    self.lexdiv.append(result["lexical diversity"])

                if (result["subjectivity"]) is not None or 0:
                    self.subjectivity.append(result["subjectivity"])

                if (result["polarity"]) is not None or 0:
                    self.polarity.append(result["polarity"])
# Example 10
    def __init__(self, term, database, score):
        """Collect statistics for articles matching *term* in the *database*
        source table whose tdIdfRoundTo7 score exceeds *score*.

        Scans the IndexTdIdf table for matching article ids, batch-fetches
        the articles in groups of 100, then accumulates per-day article
        counts, the observed date range, and lexical-diversity /
        subjectivity / polarity value lists on self.
        """

        # create variables for storing data of interest
        self.lexdiv = []
        self.subjectivity = []
        self.polarity = []
        # put both maxdate and mindate in formats that can be compared to later with dates outside the range
        self.origmaxdate = arrow.now().replace(years=-5)
        self.maxdate = self.origmaxdate.format('YYYY-MM-DD HH:mm')
        self.origmindate = arrow.now()
        self.mindate = self.origmindate.format('YYYY-MM-DD HH:mm')
        self.articleIDs = []
        self.term = term
        self.source = database
        self.score = score
        self.articleIDsub = []
        self.IDtracker = []
        self.dayscount = {}
        # each source table stores times differently; the correct arrow
        # parse format is selected using self.source as a key
        self.dtstringdict = {
            'DailyBeast': "MM-DD-YY h:mm",
            'CNN': "MMM-D-YYYY h:mm",
            "Huffington_Post": "YYYY-MM-DD HH:mm:ss",
            "Washington_Post": "YYYY-MM-DD HH:mm:ss",
            "RollingStone": "YYYY-MM-DD HH:mm:ssZZ",
            "SF_Gate": "YYYY-MM-DD HH:mm:ss"
        }
        self.dtstring = self.dtstringdict[self.source]

        # the hash-key attribute is named differently per source table;
        # the correct one is selected using self.source as a key
        self.indexdict = {
            'DailyBeast': "tstamp",
            'CNN': "tstamp",
            "Huffington_Post": "id",
            "Washington_Post": "id",
            "RollingStone": "id",
            "SF_Gate": "id"
        }
        self.index = self.indexdict[self.source]

        # the name of columns that have dates are different.  This dictionary selects the correct one.

        self.dtdict = {
            'DailyBeast': "datetime",
            'CNN': "datetime",
            "Huffington_Post": "date",
            "Washington_Post": "date",
            "RollingStone": "date",
            "SF_Gate": "date"
        }
        self.dtstr = self.dtdict[self.source]

        # These data sources were set up without a RangeKey on Dynamo, so their
        # batch-get keys carry only the hash key (see the if/else in the scan loop)
        self.norangesources = [
            "Huffington_Post", "Washington_Post", "RollingStone", "SF_Gate"
        ]

        # connect to dynamodb
        # NOTE(review): credentials are blank -- presumably injected elsewhere;
        # confirm before running outside a sandbox.
        conn = boto.dynamodb2.connect_to_region('us-west-2',
                                                aws_access_key_id='',
                                                aws_secret_access_key='')

        # Create table variables.
        table = Table(self.source, connection=conn)
        querytable = Table('IndexTdIdf', connection=conn)

        # Collect all articleIDs from the IndexTdIdf table that match the term
        # given and the desired score; keys are grouped into lists of 100 for
        # batch get (DynamoDB batch-get limit)

        for row in querytable.scan():
            if row["word"] == self.term and row[
                    "source"] == self.source and row[
                        "tdIdfRoundTo7"] > self.score and int(
                            float(row["articleId"])) not in self.IDtracker:
                # some sources take hash and range key and others just hash key for batch get
                if self.source in self.norangesources:
                    dictvalue = {self.index: int(float(row["articleId"]))}
                else:
                    dictvalue = {
                        self.index: int(float(row["articleId"])),
                        'source': self.source
                    }
                self.articleIDsub.append(dictvalue)
                self.IDtracker.append(int(float(row["articleId"])))
                if len(self.articleIDsub) == 100:
                    self.articleIDs.append(self.articleIDsub)
                    self.articleIDsub = []
                else:
                    continue
        # make sure to append any leftover keys that did not fill a group of 100
        self.articleIDs.append(self.articleIDsub)

        for entry in self.articleIDs:
            results = table.batch_get(keys=entry)
            for result in results:
                # NOTE(review): 'x is not None or ""' parses as
                # '(x is not None) or ""' -- equivalent to a plain is-not-None
                # test; the '' (and 0s below) are dead operands.  Confirm intent.
                if result[self.dtstr] is not None or '':
                    try:
                        tm = arrow.get(result[self.dtstr], self.dtstring)
                        tm = tm.format('YYYY-MM-DD')
                    except:
                        # unparseable date: skip this article entirely
                        continue

                    if tm in self.dayscount.keys():
                        self.dayscount[tm] += 1
                    else:
                        self.dayscount[tm] = 1

                    if tm > self.maxdate:
                        self.maxdate = result[self.dtstr]
                    if tm < self.mindate:
                        self.mindate = result[self.dtstr]

                if (result['lexical diversity']) is not None or 0:
                    self.lexdiv.append(result['lexical diversity'])

                if (result["subjectivity"]) is not None or 0:
                    self.subjectivity.append(result["subjectivity"])

                if (result["polarity"]) is not None or 0:
                    self.polarity.append(result["polarity"])
# Example 11
class DynamoDBAdapter(key_value_store.KeyValueStore):
    """ Implementation of an abstract key-value store defined in
    key_value_store.py. The underlying database is amazon DynamoDB.

    The store keeps all objects in a single table with following schema:
    [HashKey('kind', data_type=STRING), RangeKey('id')]. 'kind' is the string
    with the object type ('vector', 'set' or 'int') and 'id' is the object id.
    The object value is stored in the 'value' attribute of the table items.

    The table should be created before this code is executed. Amazon
    configuration is assumed to be stored in ~/.boto file as described in
    http://boto.readthedocs.org/en/latest/boto_config_tut.html
    """
    def __init__(self, precision=np.dtype('float32'), table_name='test'):
        """ Create an instance of the dynamodb key-value store.
        precision - a numpy type, elements of all vectors are converted and
           stored in this type;
        table_name - the name of the DynamoDB table which keeps the objects.
        Raises TypeError if 'precision' is not a numpy dtype.
        """
        conn = boto.dynamodb2.connect_to_region('eu-west-1')
        if not isinstance(precision, np.dtype):
            raise TypeError("Precision should be a numpy.dtype subtype")
        self.precision = precision
        self.precision_name = precision.name
        self.table = Table(table_name, connection=conn)

    def _get_or_create_item(self, kind, item_id):
        """ Return the existing ('kind', item_id) item, or a fresh unsaved
        Item carrying those keys if none exists yet. """
        try:
            item = self.table.get_item(kind=kind, id=item_id)
        except ItemNotFound:
            item = Item(self.table)
            item['kind'] = kind
            item['id'] = item_id
        return item

    def _create_vector_item(self, vec_id, vector):
        """ Build or update the item storing 'vector' under 'vec_id'.
        The vector is serialized to raw bytes in self.precision; the
        precision name is stored alongside so the bytes can be decoded. """
        item = self._get_or_create_item('vector', vec_id)
        item['value'] = Binary(vector.astype(self.precision).tostring())
        item['precision'] = self.precision_name
        return item

    def _vector_value(self, item):
        """ Decode the numpy vector serialized in 'item'. """
        # frombuffer(...).copy() is equivalent to the deprecated
        # np.fromstring: both yield a writable array owning its data.
        return np.frombuffer(str(item['value']),
                             np.dtype(item['precision'])).copy()

    def get_vector_ids(self):
        """ Return the ids of all stored vectors. """
        return [v['id'] for v in self.table.query_2(kind__eq='vector')]

    def get_int_ids(self):
        """ Return the ids of all stored integers. """
        return [v['id'] for v in self.table.query_2(kind__eq='int')]

    def get_set_ids(self):
        """ Return the ids of all stored sets. """
        return [v['id'] for v in self.table.query_2(kind__eq='set')]

    def store_vector(self, vec_id, vector):
        """ Store a single numpy vector under 'vec_id'. """
        item = self._create_vector_item(vec_id, vector)
        item.save()

    def get_vector(self, vec_id):
        """ Return the vector stored under 'vec_id'.
        Raises KeyError if the id is unknown. """
        try:
            item = self.table.get_item(kind='vector', id=vec_id)
        except ItemNotFound:
            raise KeyError('Vector key %s is unknown' % (vec_id, ))
        return self._vector_value(item)

    def bulk_get_vector(self, vec_ids):
        """ Fetch many vectors in one batch_get call; returns a list of
        numpy arrays in the order the database yields them. """
        keys = [{'kind': 'vector', 'id': i} for i in vec_ids]
        vs = self.table.batch_get(keys=keys)
        return [self._vector_value(i) for i in vs]

    def remove_vector(self, vec_id):
        """ Delete the vector stored under 'vec_id'.
        Raises KeyError if the id is unknown. """
        try:
            item = self.table.get_item(kind='vector', id=vec_id)
        except ItemNotFound:
            raise KeyError('Vector key %s is unknown' % (vec_id, ))
        item.delete()

    def add_to_set(self, set_id, element_id):
        """ Add 'element_id' to the set stored under 'set_id', creating the
        set (or resetting a non-set value) if necessary. """
        item = self._get_or_create_item('set', set_id)
        if 'value' not in item.keys() or not isinstance(item['value'], set):
            item['value'] = set()
        item['value'].add(element_id)
        item.save(overwrite=True)

    def remove_from_set(self, set_id, element_id):
        """ Remove 'element_id' from the set stored under 'set_id'.
        Raises KeyError if the set, or the element in it, is missing. """
        try:
            item = self.table.get_item(kind='set', id=set_id)
        except ItemNotFound:
            raise KeyError('Set key %s is unknown' % (set_id, ))
        if 'value' not in item.keys() or not isinstance(item['value'], set):
            raise KeyError('Incorrect value in item %s' % (set_id, ))
        if element_id not in item['value']:
            raise KeyError('Element %s not in set %s' % (element_id, set_id))
        item['value'].remove(element_id)
        item.save()

    def remove_set(self, set_id):
        """ Delete the whole set stored under 'set_id'.
        Raises KeyError if the id is unknown. """
        try:
            item = self.table.get_item(kind='set', id=set_id)
            item.delete()
        except ItemNotFound:
            raise KeyError('Set key %s is unknown' % (set_id, ))

    def get_set(self, set_id):
        """ Return the set stored under 'set_id' with its elements coerced
        to str. Raises KeyError if the id is unknown. """
        try:
            the_set = self.table.get_item(kind='set', id=set_id)['value']
            return set([str(entry) for entry in the_set])
        except ItemNotFound:
            raise KeyError('Set key %s is unknown' % (set_id, ))

    def store_int(self, int_id, integer):
        """ Store a single integer under 'int_id'. """
        item = self._get_or_create_item('int', int_id)
        item['value'] = integer
        item.save()

    def get_int(self, int_id):
        """ Return the integer stored under 'int_id'.
        Raises KeyError if the id is unknown. """
        try:
            return int(self.table.get_item(kind='int', id=int_id)['value'])
        except ItemNotFound:
            raise KeyError('Int key %s is unknown' % (int_id, ))

    def remove_int(self, int_id):
        """ Delete the integer stored under 'int_id'.
        Raises KeyError if the id is unknown. """
        try:
            item = self.table.get_item(kind='int', id=int_id)
        except ItemNotFound:
            raise KeyError('Int key %s is unknown' % (int_id, ))
        item.delete()

    def _aggregate_set_id_element_pairs(self, setpairs):
        """Turns a list of pairs of the form (set_id, element_id) into a list 'L' of
        pairs 'p' of the form (set_id, set_of_element_ids). 'L' has the property
        that if 'p' and 'q' are distinct entries in 'L', then p[0] and q[0] are
        also distinct."""
        set_ids = set([entry[0] for entry in setpairs])
        listlist = [[entry for entry in setpairs if entry[0] == set_id]
                    for set_id in set_ids]
        result = [(pairlist[0][0], set([entry[1] for entry in pairlist]))
                  for pairlist in listlist]
        return result

    def bulk_store_vector(self, vec_ids, vectors):
        """ Store many vectors at once with a batch write.
        Raises ValueError when the id and vector lists differ in length. """
        if len(vec_ids) != len(vectors):
            raise ValueError
        vecpairs = zip(vec_ids, vectors)
        with self.table.batch_write() as batch:
            for vec_id, vec in vecpairs:
                item = self._create_vector_item(vec_id, vec)
                batch.put_item(item)

    def bulk_store_vector_old(self, vectors_df):
        """Argument 'vectors_df' is a dataframe with index vector ids;
        each row is stored as a vector under its (stringified) index."""
        # BUGFIX: the original guard compared the undefined names 'vec_ids'
        # and 'vectors' and could only raise NameError; a dataframe carries
        # its own ids, so no length check is needed here.
        with self.table.batch_write() as batch:
            for ind in vectors_df.index:
                vec_id = str(ind)
                vec = vectors_df.loc[ind].values
                item = self._create_vector_item(vec_id, vec)
                batch.put_item(item)

    def bulk_store_int(self, int_ids, integers):
        """ Store many integers at once with a batch write.
        Raises ValueError when the id and value lists differ in length. """
        if len(int_ids) != len(integers):
            raise ValueError
        intpairs = zip(int_ids, integers)
        with self.table.batch_write() as batch:
            for pair in intpairs:
                int_id, integer = pair
                item = self._get_or_create_item('int', int_id)
                item['value'] = integer
                batch.put_item(item)

    def bulk_add_to_set(self, set_ids, element_ids):
        """batch_write() objects if the same item is written to more
        than once per batch, hence we aggregate all (set_id, element_id)
        pairs into a list of pairs (set_id, element_ids), where
        the 'set_id's are pairwise distinct, and the 'element_ids'
        are sets."""
        if len(set_ids) != len(element_ids):
            raise ValueError
        setpairs = zip(set_ids, element_ids)
        setlist = self._aggregate_set_id_element_pairs(setpairs)
        with self.table.batch_write() as batch:
            for pair in setlist:
                set_id, element_ids = pair
                item = self._get_or_create_item('set', set_id)
                if 'value' not in item.keys() or not isinstance(
                        item['value'], set):
                    item['value'] = set()
                item['value'].update(element_ids)
                batch.put_item(item)
예제 #12
0
파일: ddb_slurps.py 프로젝트: yz-/ut
class DDBSlurps(Dynamo):

    @classmethod
    def from_test_mode(cls, access_key=None, secret=None):
        """
        Alternate constructor: build an instance that is wired to the test
        tables ('test_slurps' / 'test_failed_slurps') instead of production.
        """
        obj = cls(access_key, secret)
        conn = obj.connection
        obj.slurps_table = Table('test_slurps', connection=conn)
        obj.failed_slurps_table = Table('test_failed_slurps', connection=conn)
        return obj

    def __init__(self, access_key=None, secret=None):
        """
        ! Use test_mode factory method for instantiating this class with test_slurps and test_failed_slurps tables
        """
        super(DDBSlurps, self).__init__(access_key, secret)
        # Bind the production tables; from_test_mode() rebinds to test tables.
        conn = self.connection
        self.slurps_table = Table('slurps', connection=conn)
        self.failed_slurps_table = Table('failed_slurps', connection=conn)

    def save_slurp_info(self, slurp_info_, overwrite=True):
        """
        Persist slurp info to the slurps table.

        slurp_info_ can either be in the form of a list of dicts or else a
        single dict. If slurp_info_ is a list, batch write will be used.
        Raises TypeError for any other input type.
        """
        if isinstance(slurp_info_, dict):
            self.slurps_table.put_item(slurp_info_, overwrite=overwrite)
        elif isinstance(slurp_info_, list):
            with self.slurps_table.batch_write() as batch:
                for s in slurp_info_:
                    batch.put_item(data=s, overwrite=overwrite)
        else:
            # BUGFIX: `raise TypeError, "..."` is py2-only syntax; the call
            # form below is valid on both py2 and py3.
            raise TypeError(
                "slurp_info must be a dict or a list of dicts, not a {}".format(
                    type(slurp_info_)))

    def save_failed_slurp(self, searchterm):
        """Record a failed slurp attempt, stamped with the current time."""
        failure_record = {'searchterm': searchterm,
                          'datetime': datetime.now().isoformat()}
        self.failed_slurps_table.put_item(data=failure_record, overwrite=True)

    def get_slurp_info(self, search_term_=None):
        """
        search_term_ can be either a string or a list of strings. Each string should be a search term you are looking
        for in the db.
        Returns either a single list of key-value tuples (if search_term_ was a string)
        or a list of key-value tuples (if search_term_ was a list)
        Each list of key-value tuples can easily be converted to a dict or an OrderedDict by the client.
        Raises TypeError for any other input type.
        """

        # searchterm_ is a STRING
        if isinstance(search_term_, basestring):
            if search_term_:
                slurp_info = (self.slurps_table.get_item(searchterm=search_term_)).items()
            else:
                # empty string: nothing to look up
                slurp_info = []

        # searchterm is a LIST of strings
        elif isinstance(search_term_, list):
            if search_term_:
                # create a set of non-empty searchterms. We use a set to avoid a duplicate query error from the db
                set_of_sts = {st for st in search_term_ if st}
                # create a list of dicts from the set
                list_of_st_dicts = [{'searchterm': st} for st in set_of_sts]
                res = self.slurps_table.batch_get(list_of_st_dicts)
                try:
                    slurp_info = [i.items() for i in res]
                except (StopIteration, IndexError):
                    # If res is empty, we get one of these errors when trying to iterate.
                    slurp_info = []
            else:
                slurp_info = []

        # searchterm is an unexpected type
        else:
            # BUGFIX: use py2/py3-compatible raise syntax, and correct the
            # message (it previously claimed dicts were expected, copied
            # from save_slurp_info).
            raise TypeError(
                "search_term_ must be a string or a list of strings, "
                "not a {}".format(type(search_term_)))

        return slurp_info

    def existing_and_missing_uni(self, searchterm_list):
        """
        Takes a list of searchterm strings and returns a pair (found, missing):
        the searchterms found in the db (as unicode) and those absent from the
        found results, order conserved.
        """
        # utf8-encode before querying the db
        utf8_terms = [to_utf8_or_bust(term) for term in searchterm_list]
        found_info = self.get_slurp_info(utf8_terms)
        found_uni = [to_unicode_or_bust(dict(kv)['searchterm'])
                     for kv in found_info]
        input_uni = [to_unicode_or_bust(term) for term in utf8_terms]
        missing_uni = order_conserving.setdiff(input_uni, found_uni)
        return found_uni, missing_uni

    def get_table(self, table_name):
        """
        Convenience accessor: hand back any DynamoDB table on this
        connection by name.
        """
        handle = Table(table_name, connection=self.connection)
        return handle

    def truncate_failed_slurp_table(self):
        """
        Delete every item from the failed-slurps table using batched deletes.
        """
        table = self.failed_slurps_table
        with table.batch_write() as batch:
            for record in table.scan():
                batch.delete_item(searchterm=record['searchterm'])

    def truncate_slurp_table(self):
        """
        Delete every item from the slurps table using batched deletes.
        WARNING! Only use for test mode table — asserts that the bound
        slurps table is 'test_slurps' before deleting anything.
        """
        assert self.slurps_table.table_name == 'test_slurps', "Will only truncate test slurps table. To truncate production table, run code manually"
        # Use the already-bound table for both the scan and the deletes;
        # the original built a second Table('test_slurps') handle for the
        # batch while scanning self.slurps_table, which (after the assert)
        # is the same table — the extra handle was redundant.
        with self.slurps_table.batch_write() as batch:
            for item in self.slurps_table.scan():
                batch.delete_item(searchterm=item['searchterm'])

    def modify_failed_slurps_throughput(self, requested_read, requested_write):
        """Adjust provisioned read/write throughput on the failed-slurps table."""
        target = self.failed_slurps_table
        return self.modify_throughput(requested_read, requested_write, target)

    def modify_slurps_throughput(self, requested_read, requested_write):
        """Adjust provisioned read/write throughput on the slurps table."""
        target = self.slurps_table
        return self.modify_throughput(requested_read, requested_write, target)

    def get_slurps_table_info(self):
        """Return the table-info structure for the slurps table."""
        target = self.slurps_table
        return self.get_table_info(target)

    def get_failed_slurps_table_info(self):
        """Return the table-info structure for the failed-slurps table."""
        target = self.failed_slurps_table
        return self.get_table_info(target)