Exemple #1
0
def mail_table(domain_name):
    """Return the mail ``Table`` for a domain, adding the prefix if missing.

    ``domain_name`` may be passed with or without ``mail_table_prefix``; a
    bare name is prefixed before the table is looked up. The resolved table
    name and its item count are printed as a diagnostic.

    Args:
        domain_name: Table name or bare domain name (a string).

    Returns:
        The ``Table`` object built from the prefixed name.
    """
    # Normalise the name up front instead of recursing once to add the prefix.
    if not domain_name.startswith(mail_table_prefix):
        domain_name = mail_table_prefix + domain_name

    # Single-argument print calls emit the same text on Python 2 and Python 3,
    # whereas the print *statement* is a SyntaxError on Python 3.
    print('mail_table: %s' % (domain_name,))
    mt = Table(domain_name)
    print('mail_table count: %s' % (mt.count(),))
    return mt
Exemple #2
0
def count_table(name):
    """Return the item count of the table called ``name``.

    The count is not live: per the original note it is delayed by about
    six hours.

    Args:
        name: Table name; must be a string.

    Returns:
        The value returned by ``Table.count()``.

    Raises:
        SystemExit: If the call is interrupted with Ctrl-C.
    """
    try:
        return Table(name, connection=client_dynamo).count()
    except KeyboardInterrupt:
        # The original body was a bare `exit`, which merely names the builtin
        # without calling it: the interrupt was swallowed and None returned.
        # Raising SystemExit performs the exit that was evidently intended.
        raise SystemExit
    def _get_table(self, table_name, create=True):
        """ Look up (and optionally create) the DynamoDB table `table_name`.

        A reference already held in the table cache is returned directly.
        Otherwise the table is probed by asking for its item count; if that
        fails with a resource-not-found error and `create` is true, the
        table is created via `_create_table`. The resulting reference is
        cached before being returned.

        Creating a table can block for a while (noted as typically ~10s),
        and the reference is only handed back once creation has returned.
        Concurrent calls for the same missing table could each attempt to
        create it, so callers may want to hold a per-table lock around
        this method.

        Args:
            table_name (str): Name of the table to fetch.
            create (bool): If table does not exist, create it.

        Returns:
            table: A boto table reference

        Raises:
            Exception: When table does not exist and `create` is False, or
                when the lookup fails for any other reason.
        """
        if table_name in self._table_cache:
            return self._table_cache[table_name]

        table_ref = Table(table_name, connection=self._conn)

        try:
            item_count = table_ref.count()
            found_msg = "Table {} found - contains {} items".format(
                table_name, item_count)
            self.logger.debug(found_msg)
        except JSONResponseError as err:
            # Anything other than "table missing and we may create it" is
            # unexpected here, so hand it straight back to the caller.
            if not create or 'ResourceNotFoundException' not in str(err):
                raise

            self.logger.info(
                "Table {} not found - creating it".format(table_name))
            table_ref = self._create_table(table_name)
            self.logger.debug("Table created: {}".format(table_ref))
        except:
            self.logger.exception("Unable to determine table reference")
            raise

        # Remember the reference so later calls skip the lookup entirely
        self._table_cache[table_name] = table_ref
        return table_ref
Exemple #4
0
 def get_mail_table(self, domain):
     """Return the 'smtp' mail table, creating it if it cannot be counted.

     The table is probed by requesting its item count. If that request
     fails for any ordinary error, the table is assumed to be missing and
     is created with a ``derived_to`` hash key, a ``derived_from`` range
     key and a read/write throughput of 3 each.

     Args:
         domain: Currently unused; the table name is fixed to 'smtp'.

     Returns:
         The existing or newly created ``Table``.
     """
     mail_table = 'smtp'
     s3_mail_table = Table(mail_table)
     try:
         item_count = s3_mail_table.count()
     except Exception:
         # Best effort: any failure to count is treated as "table missing".
         # `except Exception` (rather than a bare except) lets Ctrl-C and
         # SystemExit propagate instead of triggering a table creation.
         print('creating: %s' % (mail_table,))
         s3_mail_table = Table.create(
             mail_table,
             schema=[HashKey('derived_to'),
                     RangeKey('derived_from')],
             throughput={
                 'read': 3,
                 'write': 3
             })
     else:
         # Printed only after count() succeeded, so a failed probe no longer
         # leaves a half-written "smtp count:" line on stdout.
         print('%s count: %s' % (mail_table, item_count))
     return s3_mail_table
except:
	print "keyword does not exist in this source. Try another one"
	exit()

# A single parenthesised argument prints identically on Python 2 and 3.
print("num doc with keyword" + str(numDocsWithKeyword))

# Third command-line argument: name of the table holding the documents
# (e.g. "DailyBeast").
tablenameWithDocs = sys.argv[3]

tableWithDocs = Table(tablenameWithDocs, connection=db)

# Total number of documents in the corpus. This count is only refreshed about
# once every 6 hours, so it would need to be obtained differently in a system
# where articles are being added to the database constantly.
lengthOfCorpus = tableWithDocs.count()

tdIdfCalculator = TfIdf.TfIdf(lengthOfCorpus, numDocsWithKeyword, keyword)

# Fourth and fifth command-line arguments; judging by the variable names these
# are the column holding the document body and the column holding the unique
# id -- confirm against the code that consumes them.
columnWithBody = sys.argv[4]

columnWithUniqueId = sys.argv[5]
Exemple #6
0
except:
    print "keyword does not exist in this source. Try another one"
    exit()

# A single parenthesised argument prints identically on Python 2 and 3.
print("num doc with keyword" + str(numDocsWithKeyword))

# Third command-line argument: name of the table holding the documents
# (e.g. "DailyBeast").
tablenameWithDocs = sys.argv[3]

tableWithDocs = Table(tablenameWithDocs, connection=db)

# Total number of documents in the corpus. This count is only refreshed about
# once every 6 hours, so it would need to be obtained differently in a system
# where articles are being added to the database constantly.
lengthOfCorpus = tableWithDocs.count()

tdIdfCalculator = TfIdf.TfIdf(lengthOfCorpus, numDocsWithKeyword, keyword)

# Fourth and fifth command-line arguments; judging by the variable names these
# are the column holding the document body and the column holding the unique
# id -- confirm against the code that consumes them.
columnWithBody = sys.argv[4]

columnWithUniqueId = sys.argv[5]