Example #1
0
def main_loop(s3, p, tmp_sql_file, table_name):
    """Download event files and append their SQL statements to a file.

    Files are fetched with ``download_from_s3`` using the
    ``base_key_prefix`` and ``save_prefix`` entries of ``config.task``.
    Each downloaded file is parsed with ``parse_json`` and every event is
    turned into one statement by ``common.mysql_gen_save_sql``.  The target
    table name comes from ``get_table_name(table_name, the_time)``;
    ``create_table`` is called the first time a table name is seen.

    Side effects: extends ``global_file_list`` with the downloaded files and
    appends newly seen table names to ``global_table_list``.

    Parameters
    ----------
    s3, p
        Forwarded unchanged to ``download_from_s3`` (presumably an S3
        handle and a key/path selector -- confirm against that helper).
    tmp_sql_file : str
        Path of the file the generated SQL is appended to; it is also
        passed to ``create_table``.
    table_name : str
        Base table name handed to ``get_table_name``.

    Returns
    -------
    None
        Always; a failed download is logged and ends the call early.
    """
    global global_table_list
    global global_file_list
    task = config.task
    base_key_prefix = task['base_key_prefix']
    save_prefix = task['save_prefix']

    save_file_list = download_from_s3(s3, base_key_prefix, p, save_prefix)
    if not save_file_list or not isinstance(save_file_list, (list, tuple)):
        # Pass the value as a logging argument: "%s" % value breaks when
        # the value is a tuple (it is unpacked as the format arguments).
        logging.error("[FAIL]main_loop():[%s]", save_file_list)
        return None
    global_file_list += save_file_list
    logging.info("save_file_list:%s", save_file_list)
    field_list = ['hash', 'event_type', 'arrival_time', 'client_id',
                  'uid', 'organic', 'lang', 'package', 'referrer',
                  'country', 'ip_country', 'sim_country', 'client_time']

    for save_file in save_file_list:
        event_list = parse_json(save_file)
        if not event_list:
            logging.error("main_loop():parse_json:event_list:[%s]", event_list)
            event_list = []

        sql_list = []
        for m in event_list:
            # Pick the event time: client time first, then arrival time,
            # and the current time when neither is numeric.
            the_time = None
            for time_key in ('client_time', 'arrival_time'):
                candidate = m.get(time_key)
                if common.is_float(candidate):
                    the_time = float(candidate)
                    break
            if the_time is None:
                the_time = time.time()

            # Create the table the first time its name is seen.
            new_table_name = get_table_name(table_name, the_time)
            if new_table_name not in global_table_list:
                global_table_list.append(new_table_name)
                create_table(tmp_sql_file, new_table_name)

            sql = common.mysql_gen_save_sql(new_table_name, m, field_list, field_list)
            # Keep the encoded result (it used to be thrown away) so that
            # non-ASCII text is written as UTF-8.
            if not isinstance(sql, bytes):
                sql = sql.encode('utf-8')
            sql_list.append(sql)

        # The file is always opened in append mode, so it exists afterwards
        # even when this batch produced no statements.
        with open(tmp_sql_file, 'ab') as f:
            if sql_list:
                # End the batch with a newline so whatever is appended next
                # does not continue the last statement's line.
                f.write(b'\n'.join(sql_list) + b'\n')
Example #2
0
def replace_nan_or_string_values_with_max_in_dictionary(
        dictionary, reverse=False):
    """Replace NaN and non-numeric values of ``dictionary`` in place.

    A value counts as numeric when ``is_float`` accepts it and it is not
    NaN.  Every other value is overwritten with a single replacement:
    ``max(numeric values) + 1``, or ``min(numeric values) - 1`` when
    ``reverse`` is true.

    Parameters
    ----------
    dictionary : dict
        Mapping to clean up; it is modified in place.
    reverse : bool
        Use ``min - 1`` instead of ``max + 1`` as the replacement.

    Returns
    -------
    dict
        The same ``dictionary`` object that was passed in.

    Notes
    -----
    When there is no numeric value at all (including an empty dictionary)
    the replacement is ``0`` instead of failing in ``min``/``max``.
    """
    non_numeric_keys = []
    numeric_values = []
    # .items() works on both Python 2 and Python 3.
    for key, value in dictionary.items():
        if not is_float(value):
            non_numeric_keys.append(key)
            continue
        try:
            value_is_nan = math.isnan(value)
        except TypeError:
            # is_float() accepted something math.isnan() cannot take
            # directly (presumably a numeric string -- confirm against
            # is_float); judge it by its float form.
            value = float(value)
            value_is_nan = math.isnan(value)
        if value_is_nan:
            non_numeric_keys.append(key)
        else:
            numeric_values.append(value)

    if not numeric_values:
        non_numeric_values_replacer = 0
    elif reverse:
        non_numeric_values_replacer = min(numeric_values) - 1
    else:
        non_numeric_values_replacer = max(numeric_values) + 1

    for non_numeric_key in non_numeric_keys:
        dictionary[non_numeric_key] = non_numeric_values_replacer

    return dictionary
Example #3
0
    def _read_csv(self, key_column_name):
        """Load the CSV file at ``self.path_to_csv`` into this object.

        The first row is stored in ``self.headers``; every following row is
        converted with ``self.__row_to_dict`` and appended to ``self.data``.
        ``self.column_num`` is the number of headers and ``self.row_num``
        the number of lines in the file (the header line included).

        When ``self.auto_detect_types`` is set, each column is checked with
        ``is_int`` / ``is_float`` on every row and the result is handed to
        ``self.__format_data`` once the file has been read.

        Parameters
        ----------
        key_column_name : str or None
            If truthy, each row dict is also stored in ``self.data_dict``
            under the row's value for this column.
        """
        file_name = os.path.basename(self.path_to_csv)
        # Parenthesised form prints the same text on Python 2 and 3.
        print('Reading file "' + file_name + '"...')

        # Both file handles are closed deterministically.
        with open(self.path_to_csv, 'rt') as input_file:
            reader = csv.reader(input_file, delimiter=self.delimiter)

            self.headers = next(reader)
            self.column_num = len(self.headers)
            with open(self.path_to_csv) as line_counter:
                self.row_num = sum(1 for _ in line_counter)

            if self.auto_detect_types:
                # A column stays flagged until one value fails the check.
                int_columns = [True] * len(self.headers)
                float_columns = [True] * len(self.headers)

            line_number = 0
            bar = progressbar.ProgressBar(maxval=1.0).start()
            for row in reader:
                if self.auto_detect_types:
                    for x in range(self.column_num):
                        if int_columns[x]:
                            int_columns[x] = is_int(row[x])
                        if float_columns[x]:
                            float_columns[x] = is_float(row[x])

                row_dict = self.__row_to_dict(row)
                self.data.append(row_dict)

                if key_column_name:
                    self.data_dict[row_dict[key_column_name]] = row_dict

                line_number += 1
                # Progress is reported as a fraction of the line count.
                bar.update((line_number + 0.0) / self.row_num)
            bar.finish()

        if self.auto_detect_types:
            self.__format_data(int_columns, float_columns)
Example #4
0
def parse_label(label):
    """Parse a Newick label which may contain a support value, taxon, and/or auxiliary information.

    The recognised layout is ``<support>:<taxon>|<auxiliary info>``.  Only
    the first ``|`` and the first ``:`` act as separators, so later
    occurrences stay part of the auxiliary information or the taxon.

    Parameters
    ----------
    label : str
        Internal label in a Newick tree.

    Returns
    -------
    float
        Support value specified by label, or None
    str
        Taxon specified by label, or None
    str
        Auxiliary information, or None

    Raises
    ------
    ValueError
        If the label contains ':' and the text before it is not a number.
    """

    support = None
    taxon = None
    auxiliary_info = None

    if not label:
        return support, taxon, auxiliary_info

    label = label.strip()
    if '|' in label:
        # maxsplit=1: a second '|' must not break the two-value unpacking.
        label, auxiliary_info = label.split('|', 1)

    if ':' in label:
        support, taxon = label.split(':', 1)
        support = float(support)
    elif is_float(label):
        # A bare number is a support value without a taxon.
        support = float(label)
    elif label != '':
        taxon = label

    return support, taxon, auxiliary_info
Example #5
0
def parse_json(json_file):
    """Parse a gzip-compressed file holding one JSON event per line.

    Blank lines are skipped.  Every other line is decoded with
    ``json.loads`` and flattened into a dict with the keys of the default
    record below; ``hash`` is the MD5 hex digest of the stripped line.

    The two timestamps are divided by 1000 (presumably milliseconds to
    seconds -- confirm against the producer) and passed through
    ``common.format_utc_time``; a non-numeric timestamp is replaced by 1
    before formatting.

    Parameters
    ----------
    json_file : str
        Path of the gzip file to read.

    Returns
    -------
    list of dict
        Events parsed so far.  Reading stops at the first error, which is
        logged; the events collected before it are still returned.
    """
    logging.info("parse_json():%s", json_file)
    event_list = []
    _default_value = {
        'event_type': '',
        'client_time': '1000',
        'arrival_time': '1000',
        'client_id': '',
        'uid': '',
        'country': '',
        'ip_country': '',
        'sim_country': '',
        'lang': '',
        'referrer': '',
        'package': '',
        'organic': -1,
        'hash': ''
    }
    try:
        with gzip.open(json_file, 'rb') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                m = _default_value.copy()
                m['hash'] = hashlib.md5(line).hexdigest()
                j = json.loads(line)
                m['event_type'] = j.get('event_type', '')
                m['client_time'] = j.get('event_timestamp', 1000)
                m['arrival_time'] = j.get('arrival_timestamp', 1000)

                application = j.get('application')
                if application:
                    m['package'] = application.get('package_name', '')

                client = j.get('client')
                if client:
                    m['client_id'] = client.get('client_id', '')

                device = j.get('device')
                locale = device.get('locale') if device else None
                if locale:
                    m['country'] = locale.get('country', '')
                    m['lang'] = locale.get('language', '')

                attributes = j.get('attributes')
                if attributes:
                    m['uid'] = attributes.get('DeviceId', '')
                    m['referrer'] = attributes.get('Referrer', '')
                    m['ip_country'] = attributes.get('country', '')
                    m['sim_country'] = attributes.get('simCountry', '')
                    # 'true' -> 1, 'false' -> 0, anything else -> -1.
                    is_organic = attributes.get('IsOrganic', '')
                    if is_organic == 'true':
                        m['organic'] = 1
                    elif is_organic == 'false':
                        m['organic'] = 0
                    else:
                        m['organic'] = -1

                for param in ['client_time', 'arrival_time']:
                    val = m[param]
                    if common.is_float(val):
                        val = float(val) / 1000
                    else:
                        val = 1
                    m[param] = common.format_utc_time(val)
                event_list.append(m)
    except Exception as e:
        # Best effort: keep what was parsed, but report through the same
        # logging module used above and include the traceback.
        logging.exception("parse_json:Exception:%s", e)
    # A plain return (not one inside ``finally``) lets KeyboardInterrupt and
    # SystemExit propagate instead of being silently discarded.
    return event_list
Example #6
0
    def read_from_tree(self, tree, warnings=True):
        """Obtain the taxonomy for each extant taxa as specified by internal tree labels.

        For every leaf the ancestors are walked up to the root; each
        ancestor label that is not a plain number contributes its
        ';'-separated taxa in front of those already collected.  For
        labels containing ':', only the second ':'-separated field is used.

        Parameters
        ----------
        tree : str or dendropy.Tree
            Filename of newick tree or dendropy tree object.
        warnings : bool
            Log a warning when more than 7 taxa are collected for a leaf.

        Returns
        -------
        dict : d[unique_id] -> [d__<taxon>, ..., s__<taxon>]
            Taxa indexed by unique ids.
        """

        # ``basestring`` only exists on Python 2; fall back to ``str``.
        try:
            string_types = basestring
        except NameError:
            string_types = str

        if isinstance(tree, string_types):
            tree = dendropy.Tree.get_from_path(tree,
                                               schema='newick',
                                               rooting="force-rooted",
                                               preserve_underscores=True)

        taxonomy = {}
        for leaf in tree.leaf_node_iter():
            taxa = []

            node = leaf.parent_node
            while node:
                if node.label:
                    taxa_str = node.label
                    if ':' in taxa_str:
                        taxa_str = taxa_str.split(':')[1]

                    # An empty remainder (label such as "95:") carries no
                    # taxon; skipping it also avoids indexing an empty
                    # string below.
                    if taxa_str and not is_float(taxa_str):
                        if taxa_str.endswith(';'):
                            taxa_str = taxa_str[:-1]

                        # check for concatenated ranks of the form:
                        # p__Crenarchaeota__c__Thermoprotei
                        for prefix in Taxonomy.rank_prefixes:
                            taxa_str = taxa_str.replace('__' + prefix,
                                                        ';' + prefix)

                        # appears to be an internal label and not simply a
                        # support value
                        taxa = [x.strip() for x in taxa_str.split(';')] + taxa
                node = node.parent_node

            if warnings and len(taxa) > 7:
                self.logger.warning(
                    'Invalid taxonomy string read from tree for taxon %s: %s' %
                    (leaf.taxon.label, taxa))

            # check if genus name should be appended to species label
            if len(taxa) == 7:
                # [3:] drops the 3-character rank prefix (e.g. 's__').
                genus = taxa[5][3:]
                species = taxa[6][3:]
                if genus not in species:
                    taxa[6] = 's__' + genus + ' ' + species

            taxa = self.fill_trailing_ranks(taxa)
            taxonomy[leaf.taxon.label] = taxa

        return taxonomy