Code Example #1
    def main(self, name, opts):
        logging.basicConfig(filename=opts.log_file,
                            format='%(levelname)s (%(asctime)s): %(message)s')
        log = logging.getLogger(name)
        if opts.verbose:
            log.setLevel(logging.DEBUG)
        else:
            log.setLevel(logging.INFO)
        log.debug(opts)

        output_names = dat.get_output_names(opts.data_files[0],
                                            regex=opts.output_names)
        stats = OrderedDict()
        for name in output_names:
            output = hdf.read(opts.data_files,
                              'outputs/%s' % name,
                              nb_sample=opts.nb_sample)
            output = list(output.values())[0]
            stats[name] = get_output_stats(output)
        tmp = []
        for key, value in six.iteritems(stats):
            tmp.append(pd.DataFrame(value, index=[key]))
        stats = pd.concat(tmp)
        stats.index.name = 'output'
        stats.reset_index(inplace=True)

        print(stats.to_string())
        if opts.out_tsv:
            stats.to_csv(opts.out_tsv, sep='\t', index=False)

        if opts.out_fig:
            plot_stats(stats).savefig(opts.out_fig)

        return 0
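get_output_stats is not part of this excerpt; below is a minimal, hypothetical sketch of what such a per-output summary helper could return so that each entry becomes one row of the stats table (the chosen statistics are an assumption, not the original implementation):

import numpy as np

def get_output_stats(output):
    # hypothetical stand-in: reduce one output array to a dict of scalars,
    # which pd.DataFrame(value, index=[key]) then turns into a single row
    output = np.asarray(output)
    return {'nb_sample': int(output.shape[0]),
            'mean': float(np.mean(output)),
            'min': float(np.min(output)),
            'max': float(np.max(output))}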
Code Example #2
    def main(self, name, opts):
        logging.basicConfig(filename=opts.log_file,
                            format='%(levelname)s (%(asctime)s): %(message)s')
        log = logging.getLogger(name)
        if opts.verbose:
            log.setLevel(logging.DEBUG)
        else:
            log.setLevel(logging.INFO)
        log.debug(opts)

        output_names = dat.get_output_names(opts.data_files[0],
                                            regex=opts.output_names)
        stats = OrderedDict()
        for name in output_names:
            output = hdf.read(opts.data_files, 'outputs/%s' % name,
                              nb_sample=opts.nb_sample)
            output = list(output.values())[0]
            stats[name] = get_output_stats(output)
        tmp = []
        for key, value in six.iteritems(stats):
            tmp.append(pd.DataFrame(value, index=[key]))
        stats = pd.concat(tmp)
        stats.index.name = 'output'
        stats.reset_index(inplace=True)

        print(stats.to_string())
        if opts.out_tsv:
            stats.to_csv(opts.out_tsv, sep='\t', index=False)

        if opts.out_fig:
            plot_stats(stats).savefig(opts.out_fig)

        return 0
Code Example #3
File: aggregate_plots.py Project: Xiuying/illumitag
class FractionTaxaBarStack(Graph):
    """Comparing all fractions across all pools in a barstack"""
    short_name = 'fraction_taxa_barstack'

    def plot(self):
        self.frame = OrderedDict((('%s - %s' % (p,f), getattr(p.fractions, f).rdp.phyla)
                     for f in ('low', 'med', 'big') for p in self.parent.pools))
        self.frame = pandas.DataFrame(self.frame)
        self.frame = self.frame.fillna(0)
        self.frame = self.frame.transpose()
        self.frame = self.frame.apply(lambda x: 100*x/x.sum(), axis=1)
        # Sort the table by sum #
        sums = self.frame.sum()
        sums.sort(ascending=False)
        self.frame = self.frame.reindex_axis(sums.keys(), axis=1)
        # Plot #
        fig = pyplot.figure()
        axes = self.frame.plot(kind='bar', stacked=True, color=cool_colors)
        fig = pyplot.gcf()
        # Other #
        axes.set_title('Species relative abundances per fraction per pool')
        axes.set_ylabel('Relative abundances in percent')
        axes.xaxis.grid(False)
        axes.yaxis.grid(False)
        axes.set_ylim([0,100])
        # Put a legend below current axis
        axes.legend(loc='upper center', bbox_to_anchor=(0.5, -0.20), fancybox=True, shadow=True, ncol=5)
        # Save it #
        self.save_plot(fig, axes, width=24.0, height=14.0, bottom=0.30, top=0.97, left=0.04, right=0.98)
        self.frame.to_csv(self.csv_path)
        pyplot.close(fig)
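Series.sort(ascending=False) and DataFrame.reindex_axis used above were removed in later pandas releases; a minimal sketch of the same column reordering with the current API (the column names are illustrative):

import pandas as pd

frame = pd.DataFrame({'Proteobacteria': [30, 80], 'Firmicutes': [60, 10], 'Others': [10, 10]})
sums = frame.sum().sort_values(ascending=False)   # replaces the in-place Series.sort
frame = frame.reindex(columns=sums.index)         # replaces reindex_axis(..., axis=1)
print(frame.columns.tolist())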
Code Example #4
    def save_results(self):
        results_dict = OrderedDict()
        results_dict['Date'] = [datetime.datetime.now().strftime('%a %d %b')]
        results_dict['Correct'] = [len(self.correct)]
        results_dict['Answered'] = [len(self.correct) + len(self.wrong)]
        results_dict['Wrong'] = [sorted(self.wrong)]
        results_dict['Topic'] = [self.topic]
        results_df = pd.DataFrame(data=results_dict)
        try:
            # append the new row to any existing results file for this course
            existing = pd.read_csv(f'results/{self.course}.csv', sep='\t')
            existing = existing.append(results_df)
            existing.to_csv(f'results/{self.course}.csv',
                            sep='\t',
                            index=False)
        except FileNotFoundError:
            # no results file yet: start a new one
            results_df.to_csv(f'results/{self.course}.csv',
                              sep='\t',
                              index=False)
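DataFrame.append used in the try branch above was deprecated and later removed from pandas; a minimal sketch of the same append-or-create logic with pd.concat (path and columns follow the example above):

import pandas as pd

def append_results(results_df, path):
    # hypothetical helper: extend an existing results file if present, otherwise create it
    try:
        existing = pd.read_csv(path, sep='\t')
        combined = pd.concat([existing, results_df], ignore_index=True)
    except FileNotFoundError:
        combined = results_df
    combined.to_csv(path, sep='\t', index=False)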
Code Example #5
class FractionTaxaBarStack(Graph):
    """Comparing all fractions across all pools in a barstack"""
    short_name = 'fraction_taxa_barstack'

    def plot(self):
        self.frame = OrderedDict(
            (('%s - %s' % (p, f), getattr(p.fractions, f).rdp.phyla)
             for f in ('low', 'med', 'big') for p in self.parent.pools))
        self.frame = pandas.DataFrame(self.frame)
        self.frame = self.frame.fillna(0)
        self.frame = self.frame.transpose()
        self.frame = self.frame.apply(lambda x: 100 * x / x.sum(), axis=1)
        # Sort the table by sum #
        sums = self.frame.sum()
        sums.sort(ascending=False)
        self.frame = self.frame.reindex_axis(sums.keys(), axis=1)
        # Plot #
        fig = pyplot.figure()
        axes = self.frame.plot(kind='bar', stacked=True, color=cool_colors)
        fig = pyplot.gcf()
        # Other #
        axes.set_title('Species relative abundances per fraction per pool')
        axes.set_ylabel('Relative abundances in percent')
        axes.xaxis.grid(False)
        axes.yaxis.grid(False)
        axes.set_ylim([0, 100])
        # Put a legend below current axis
        axes.legend(loc='upper center',
                    bbox_to_anchor=(0.5, -0.20),
                    fancybox=True,
                    shadow=True,
                    ncol=5)
        # Save it #
        self.save_plot(fig,
                       axes,
                       width=24.0,
                       height=14.0,
                       bottom=0.30,
                       top=0.97,
                       left=0.04,
                       right=0.98)
        self.frame.to_csv(self.csv_path)
        pyplot.close(fig)
Code Example #6
File: GFF.py Project: mahajrod/RouToolPa
class CollectionGFF(Parser):
    def __init__(self,
                 in_file=None,
                 records=None,
                 format="gff",
                 parsing_mode="only_coordinates",
                 black_list=(),
                 white_list=(),
                 featuretype_separation=False,
                 scaffold_syn_dict=None):
        """
        IMPORTANT: coordinates are converted to 0-based

        :param in_file:
        :param records:
        :param format:
        :param parsing_mode:
        :param black_list:
        :param white_list:
        :param featuretype_separation:
        :param scaffold_syn_dict:

        """
        self.formats = ["gff", "gtf", "bed"]
        self.GFF_COLS = AnnotationFormats.GFF_COLS
        self.BED_COLS = AnnotationFormats.BED_COLS
        self.parsing_parameters = {
            "gff": {
                "only_coordinates": {
                    "col_names": ["scaffold", "start", "end"],
                    "cols": [0, 3, 4],
                    "index_cols": "scaffold",
                    "converters": {
                        "scaffold": str,
                        "start": lambda x: np.int32(x) - 1,
                        "end": np.int32,
                    },
                    "col_name_indexes": {
                        "scaffold": 0,
                        "start": 1,
                        "end": 2
                    },
                },
                "coordinates_and_type": {
                    "col_names": ["scaffold", "featuretype", "start", "end"],
                    "cols": [0, 2, 3, 4],
                    "index_cols": ["scaffold"],
                    "converters": {
                        "scaffold": str,
                        "featuretype": str,
                        "start": lambda x: np.int32(x) - 1,
                        "end": np.int32,
                    },
                    "col_name_indexes": {
                        "scaffold": 0,
                        "featuretype": 1,
                        "start": 2,
                        "end": 3
                    },
                },
                "coord_and_attr": {
                    "col_names": [
                        "scaffold", "featuretype", "start", "end", "strand",
                        "attributes"
                    ],
                    "cols": [0, 2, 3, 4, 6, 8],
                    "index_cols": ["scaffold", "featuretype"],
                    "converters": {
                        "scaffold": str,
                        "featuretype": str,
                        "start": lambda x: np.int32(x) - 1,
                        "end": np.int32,
                        "strand": str,
                        "attributes": str,
                    },
                    "col_name_indexes": {
                        "scaffold": 0,
                        "featuretype": 1,
                        "start": 2,
                        "end": 3,
                        "strand": 4,
                        "attributes": 5,
                    },
                },
                "all": {
                    "col_names": [
                        "scaffold", "source", "featuretype", "start", "end",
                        "score", "strand", "phase", "attributes"
                    ],
                    "cols": [0, 1, 2, 3, 4, 5, 6, 7, 8],
                    "index_cols": ["scaffold"],
                    "converters": {
                        "scaffold": str,
                        "source": str,
                        "featuretype": str,
                        "start": lambda x: np.int32(x) - 1,
                        "end": np.int32,
                        "score": str,
                        "strand": str,
                        "phase": str,
                        "attributes": str,
                    },
                    "col_name_indexes": {
                        "scaffold": 0,
                        "source": 1,
                        "featuretype": 2,
                        "start": 3,
                        "end": 4,
                        "score": 5,
                        "strand": 6,
                        "phase": 7,
                        "attributes": 8
                    },
                },
                "complete": {
                    "col_names": [
                        "scaffold", "source", "featuretype", "start", "end",
                        "score", "strand", "phase", "attributes"
                    ],
                    "cols": [0, 1, 2, 3, 4, 5, 6, 7, 8],
                    "index_cols": ["scaffold"],
                    "converters": {
                        "scaffold": str,
                        "source": str,
                        "featuretype": str,
                        "start": lambda x: np.int32(x) - 1,
                        "end": np.int32,
                        "score": str,
                        "strand": str,
                        "phase": str,
                        "attributes": str,
                    },
                    "col_name_indexes": {
                        "scaffold": 0,
                        "source": 1,
                        "featuretype": 2,
                        "start": 3,
                        "end": 4,
                        "score": 5,
                        "strand": 6,
                        "phase": 7,
                        "attributes": 8
                    },
                },
            },
            "bed": {
                "only_coordinates": {
                    "col_names": ["scaffold", "start", "end"],
                    "cols": [0, 1, 2],
                    "index_cols": "scaffold",
                    "converters": {
                        "scaffold": str,
                        "start": np.int32,
                        "end": np.int32,
                    },
                    "col_name_indexes": {
                        "scaffold": 0,
                        "start": 1,
                        "end": 2
                    },
                },
            }
        }
        self.parsing_mode = parsing_mode
        self.featuretype_separation = featuretype_separation
        self.featuretype_parsing_modes = [
            "coordinates_and_type", "all", "coord_and_attr", "complete"
        ]
        self.attributes_parsing_modes = ["complete", "coord_and_attr"]
        self.format = format
        self.black_list = black_list
        self.white_list = white_list

        self.featuretype_list = []
        self.scaffold_syn_dict = scaffold_syn_dict

        # attributes type conversion parameters
        self.parameter_separator_dict = OrderedDict()
        self.default_replace_dict = OrderedDict({".": None})
        self.converters = OrderedDict()
        self.pandas_int_type_correspondence = OrderedDict({
            "Int8": np.float16,
            "Int16": np.float16,
            "Int32": np.float32,
            "Int64": np.float64,
        })

        # init aliases
        self.record_id_col = self.parsing_parameters[self.format][
            self.parsing_mode]["col_name_indexes"]["scaffold"]
        self.record_start_col = self.parsing_parameters[self.format][
            self.parsing_mode]["col_name_indexes"]["start"]
        self.record_end_col = self.parsing_parameters[self.format][
            self.parsing_mode]["col_name_indexes"]["end"]

        self.col_names = self.parsing_parameters[self.format][
            self.parsing_mode]["col_names"]
        self.index_cols = self.parsing_parameters[self.format][
            self.parsing_mode]["index_cols"]

        # load records
        self.featuretype_list = None
        if in_file:
            self.read(in_file,
                      format=format,
                      parsing_mode=parsing_mode,
                      black_list=black_list,
                      white_list=white_list,
                      featuretype_separation=featuretype_separation)

        else:
            self.records = records

        if featuretype_separation and (self.parsing_mode
                                       in self.featuretype_parsing_modes):
            self.scaffold_dict = OrderedDict([
                (featuretype, self.records[featuretype].index.get_level_values(
                    'scaffold').unique().to_list())
                for featuretype in self.featuretype_list
            ])
            self.scaffold_list = list(self.scaffold_dict.values())
        else:
            self.scaffold_list = self.records.index.get_level_values(
                'scaffold').unique().to_list()
            self.scaffold_dict = None

    def read(self,
             in_file,
             format="gff",
             parsing_mode="only_coordinates",
             featuretype_separation=False,
             sort=False,
             black_list=(),
             white_list=()):
        if format not in self.parsing_parameters:
            raise ValueError(
                "ERROR!!! This format(%s) was not implemented yet for parsing!"
                % format)
        elif parsing_mode not in self.parsing_parameters[format]:
            raise ValueError(
                "ERROR!!! This format(%s) was not implemented yet for parsing in this mode(%s)!"
                % (format, parsing_mode))

        print("%s\tReading input..." % str(datetime.datetime.now()))
        self.records = pd.read_csv(
            in_file,
            sep='\t',
            header=None,
            na_values=".",
            comment="#",
            usecols=self.parsing_parameters[format][parsing_mode]["cols"],
            converters=self.parsing_parameters[format][parsing_mode]
            ["converters"],
            names=self.parsing_parameters[format][parsing_mode]["col_names"],
            index_col=self.parsing_parameters[format][parsing_mode]
            ["index_cols"])

        if white_list or black_list:
            scaffolds_to_keep = self.get_filtered_entry_list(
                self.records.index,
                entry_black_list=black_list,
                entry_white_list=white_list)
            self.records = self.records[self.records.index.get_level_values(
                'scaffold').isin(scaffolds_to_keep)]

        if self.scaffold_syn_dict:
            self.records["scaffold"].replace(self.scaffold_syn_dict,
                                             inplace=True)

        self.records.index = pd.MultiIndex.from_arrays(
            [self.records.index,
             np.arange(0, len(self.records))],
            names=("scaffold", "row"))
        print("%s\tReading input finished..." % str(datetime.datetime.now()))

        if parsing_mode in self.featuretype_parsing_modes:
            self.featuretype_list = list(self.records[["featuretype"
                                                       ]].iloc[:, 0].unique())
            if featuretype_separation:
                self.records = OrderedDict([
                    (featuretype,
                     self.records[self.records["featuretype"] == featuretype])
                    for featuretype in self.featuretype_list
                ])

        if parsing_mode in self.attributes_parsing_modes:
            retained_columns = deepcopy(self.parsing_parameters[self.format][
                self.parsing_mode]["col_names"])
            for entry in "attributes", "scaffold":
                retained_columns.remove(entry)

            if featuretype_separation and (parsing_mode
                                           in self.featuretype_parsing_modes):
                attributes_dict = self.parse_attributes()
                for featuretype in self.featuretype_list:
                    #self.records[featuretype].columns = pd.MultiIndex.from_arrays([
                    #                                                               self.records[featuretype].columns,
                    #                                                               self.records[featuretype].columns,
                    #                                                               ])
                    self.records[featuretype] = pd.concat([
                        self.records[featuretype][retained_columns],
                        attributes_dict[featuretype]
                    ],
                                                          axis=1)
            else:
                attributes = self.parse_attributes()
                #self.records.columns = pd.MultiIndex.from_arrays([
                #                                                  self.records.columns,
                #                                                  self.records.columns,
                #                                                  ])

                self.records = pd.concat(
                    [self.records[retained_columns], attributes], axis=1)
        if sort:
            self.records = self.records.sort_values(by=["scaffold", "start", "end"])

    def parse_column(self, column, param):
        #col.replace(self.default_replace_dict, inplace=True)
        if param not in self.converters:
            return column
        elif self.converters[param] == str:
            return column
        if self.converters[param] in self.pandas_int_type_correspondence:
            col = column.apply(self.pandas_int_type_correspondence[
                self.converters[param]]).astype(self.converters[param])
        else:
            col = column.apply(self.converters[param])

        return col

    def parse_attributes(self):
        print("%s\tParsing attribute field..." % str(datetime.datetime.now()))
        if isinstance(self.records, (OrderedDict, dict)):
            tmp_attr_dict = OrderedDict()
            for entry in self.records:
                tmp_attr = map(
                    lambda s: OrderedDict(
                        map(lambda b: b.split("="), s.split(";"))),
                    list(self.records[entry]["attributes"]))
                tmp_attr = pd.DataFrame(tmp_attr)

                shape = np.shape(tmp_attr)
                column_number = 1 if len(shape) == 1 else shape[1]

                #tmp_attr.columns = pd.MultiIndex.from_arrays([
                #                                              ["attributes"] * column_number,
                #                                              tmp_attr.columns
                #                                             ])

                tmp_attr.index = self.records[entry].index
                tmp_attr_dict[entry] = tmp_attr
            print("%s\tParsing attribute field finished..." %
                  str(datetime.datetime.now()))
            return tmp_attr_dict

        elif isinstance(self.records, (pd.DataFrame, )):

            tmp_attr = map(
                lambda s: OrderedDict(map(lambda b: b.split("="), s.split(";"))
                                      ), list(self.records["attributes"]))
            tmp_attr = pd.DataFrame(tmp_attr)

            shape = np.shape(tmp_attr)
            column_number = 1 if len(shape) == 1 else shape[1]

            #tmp_attr.columns = pd.MultiIndex.from_arrays([
            #                                              ["attributes"] * column_number,
            #                                              tmp_attr.columns
            #                                             ])
            tmp_attr.index = self.records.index
            print("%s\tParsing attribute field finished..." %
                  str(datetime.datetime.now()))
            return tmp_attr
        else:
            raise ValueError("ERROR!!! Unknown format of the records!")

    def get_attribute_names(self):
        if self.featuretype_separation:
            attributes_dict = OrderedDict()
            for feature in self.records:
                attributes_dict[feature] = list(self.records[feature][
                    AnnotationFormats.GFF_COLS["attributes"] -
                    1:])  # -1 is necessary as scaffold column is part of index

            return attributes_dict

        return list(self.records[AnnotationFormats.GFF_COLS["attributes"] - 1:]
                    )  # -1 is necessary as scaffold column is part of index

    def total_length(self):
        return np.sum(self.records['end'] - self.records['start'])

    #def get_feature_length(self, output=None, featuretype_list=None):
    #
    #    feature_records = self.records[self.records["featuretype"].isin(featuretype_list)] if featuretype_list else self.records

    def collapse_records(self, sort=True, verbose=True):
        """
        strand-independent collapse
        :param sort:
        :param verbose:
        :return:
        """
        if self.featuretype_separation:
            raise ValueError(
                "ERROR!!! Record collapse for parsing with feature separation was not implemented yet!"
            )
        else:
            records_before_collapse = len(self.records)

            if sort:
                self.records = self.records.sort_values(by=["scaffold", "start", "end"])
            row_list = []
            for scaffold in self.scaffold_list:
                #print scaffold
                # check if there is only one record per scaffold, necessary as pandas will return an integer instead of a Series
                if len(self.records.loc[[scaffold]]) == 1:
                    for row in self.records.loc[[scaffold
                                                 ]].itertuples(index=True):
                        row_list.append(list(row))
                    continue
                #print self.records.loc[scaffold]
                # remove nested records
                end_diff = self.records.loc[[scaffold]]['end'].diff()
                #print len(end_diff)
                end_diff.iloc[0] = 1  # first diff is NaN: keep the first record
                no_nested_records_df = self.records.loc[[scaffold
                                                         ]][end_diff > 0]
                #print len(no_nested_records_df)
                # collapse overlapping records

                row_iterator = no_nested_records_df.itertuples(index=True)

                prev_row = list(next(row_iterator))

                for row in row_iterator:
                    row_l = list(row)
                    if row_l[self.record_start_col] > prev_row[
                            self.record_end_col]:
                        row_list.append(prev_row)
                        prev_row = row_l
                    else:
                        prev_row[self.record_end_col] = row_l[
                            self.record_end_col]

                row_list.append(prev_row)
            self.records = pd.DataFrame.from_records(row_list,
                                                     columns=self.col_names,
                                                     index=self.index_cols)

            if verbose:
                print(
                    "Records before collapsing: %i\nRecords after collapsing: %i"
                    % (records_before_collapse, len(self.records)))

    def remove_small_records(self, min_record_length):
        if self.featuretype_separation:
            raise ValueError(
                "ERROR!!! Removal of small records for parsing with feature separation "
                "was not implemented yet!")
        else:
            records_before_collapse = len(self.records)
            self.records = self.records[(
                self.records['end'] -
                self.records['start']) >= min_record_length]
            print("Records before filtering: %i\nRecords afterfiltering: %i" %
                  (records_before_collapse, len(self.records)))

    def __add__(self, other):
        new_gff_record = CollectionGFF(records=pd.concat(
            [self.records, other.records]),
                                       in_file=None,
                                       format=self.format,
                                       parsing_mode=self.parsing_mode,
                                       black_list=self.black_list,
                                       white_list=self.white_list)
        new_gff_record.records = new_gff_record.records.sort_values(
            by=["scaffold", "start", "end"])

        return new_gff_record

    def __radd__(self, other):
        new_gff_record = CollectionGFF(records=pd.concat(
            [other.records, self.records]),
                                       in_file=None,
                                       format=other.format,
                                       parsing_mode=other.parsing_mode,
                                       black_list=other.black_list,
                                       white_list=other.white_list)
        new_gff_record.records = new_gff_record.records.sort_values(
            by=["scaffold", "start", "end"])

        return new_gff_record

    def sequence_generator(self,
                           records,
                           sequence_collection,
                           expression=None):
        for entry in records.itertuples():
            if expression:
                if not expression(entry):
                    continue
            yield entry[self.record_id_col], sequence_collection[entry[
                self.record_id_col]][
                    entry[self.record_start_col]:entry[self.record_end_col]]

    def get_introns(self,
                    exon_feature="CDS",
                    parent_id_field="Parent",
                    id_field="ID",
                    intron_id_prefix="intron",
                    intron_id_digit_number=8):
        # TODO: CORRECT ERRORS FOR INTRONS IN - STRAND IN CASE WHEN EXONS ARE REVERSE SORTED BY COORDINATE
        if self.featuretype_separation:
            intron_index = 1
            intron_id_template = "%s%%0%ii" % (intron_id_prefix,
                                               intron_id_digit_number)

            self.records["intron"] = self.records[exon_feature].copy(deep=True)
            self.records["intron"]["start"], self.records["intron"]["end"] = self.records["intron"]["end"], \
                                                                             self.records["intron"]["start"].shift(periods=-1, fill_value=0)
            self.records["intron"].index = self.records[
                "intron"].index.droplevel(level=1)

            self.records["intron"]["row"] = range(0,
                                                  len(self.records["intron"]))
            self.records["intron"].set_index("row", append=True, inplace=True)

            self.records["intron"].drop(self.records["intron"].groupby(
                parent_id_field, sort=False).agg(
                    {parent_id_field: 'count'})[parent_id_field].cumsum() - 1,
                                        level=1,
                                        inplace=True)

            intron_number = len(self.records["intron"])

            self.records["intron"]["phase"] = 0
            self.records["intron"]["featuretype"] = "intron"

            self.records["intron"][id_field] = [
                intron_id_template % i
                for i in range(intron_index, intron_index + intron_number)
            ]

            self.records["intron"].index = self.records[
                "intron"].index.droplevel(level=1)
            self.records["intron"]["row"] = range(0,
                                                  len(self.records["intron"]))
            self.records["intron"].set_index("row", append=True, inplace=True)

    def write_introns(self, output):
        if "intron" in self.records:
            with open(output, "w") as out_fd:
                for row_tuple in self.records["intron"].copy(
                        deep=True).reset_index(level="scaffold").itertuples(
                            index=False):
                    out_fd.write(
                        "%s\t%i\t%i\t%s\t%i\t%s\n" %
                        ("\t".join(row_tuple[:3]), row_tuple[3] + 1,
                         row_tuple[4], "\t".join(
                             row_tuple[5:7]), row_tuple[7], ";".join([
                                 "%s=%s" % (self.records["intron"].columns[i],
                                            str(row_tuple[i]))
                                 for i in range(
                                     8, len(self.records["intron"].columns))
                             ])))
        else:
            raise ValueError("ERROR!!! No introns were found!")

    def write(self,
              output,
              output_format,
              source="custom",
              feature_type="region"):

        if self.format == "bed":
            if self.parsing_mode == "only_coordinates":
                if output_format == "bed":
                    self.records.to_csv(output,
                                        sep="\t",
                                        index=True,
                                        header=False)
                elif output_format == "gff":
                    entry_template = "%s\t%s\t%s\t%i\t%i\t.\t.\t.\t.\n"
                    with open(output, "w") as out_fd:
                        for record_tuple in self.records.reset_index(
                                level=0).itertuples(index=False):
                            out_fd.write(
                                entry_template %
                                (record_tuple[0], source, feature_type,
                                 record_tuple[1] + 1, record_tuple[2]))

    def extract_sequences_by_type(self,
                                  sequence_collection,
                                  record_type_black_list=[],
                                  record_type_white_list=[],
                                  return_type="collection",
                                  records_parsing_type="parse"):

        if self.parsing_mode in self.featuretype_parsing_modes:
            if return_type == "collection":
                selected_records = self.records[
                    self.records.index.isin(record_type_white_list, level=1) &
                    (~self.records.index.isin(record_type_black_list, level=1)
                     )]

                from RouToolPa.Parsers.Sequence import CollectionSequence

                extracted_records = CollectionSequence()

        else:
            pass
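The heart of CollectionGFF.read is a pd.read_csv call whose converters shift GFF start coordinates from 1-based to 0-based; a self-contained sketch of that pattern on a two-record in-memory GFF (the sample records are made up for illustration):

import io

import numpy as np
import pandas as pd

gff_text = ("chr1\tsource\tgene\t100\t500\t.\t+\t.\tID=gene1\n"
            "chr1\tsource\tgene\t700\t900\t.\t-\t.\tID=gene2\n")
gff_columns = ["scaffold", "source", "featuretype", "start", "end",
               "score", "strand", "phase", "attributes"]
records = pd.read_csv(io.StringIO(gff_text), sep='\t', header=None, comment='#',
                      names=gff_columns,
                      usecols=["scaffold", "start", "end"],
                      index_col="scaffold",
                      converters={"start": lambda x: np.int32(x) - 1,  # GFF is 1-based
                                  "end": np.int32})
print(records)  # start is now 99 and 699, end stays 500 and 900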
Code Example #7
        tx_state_code.append(seseds["StateCode"][i])
        tx_year.append(seseds["Year"][i])
        tx_data.append(seseds["Data"][i])
az_comp_data = OrderedDict()
ca_comp_data = OrderedDict()
nm_comp_data = OrderedDict()
tx_comp_data = OrderedDict()
item_dict = OrderedDict()
item_dict["MSN"] = az_msn
item_dict["StateCode"] = az_state_code
item_dict["Year"] = az_year
item_dict["Data"] = az_data
az_comp_data = pd.DataFrame(item_dict)
az_comp_data.to_csv(
    "C:\\Users\\THINKPAD\\PycharmProjects\\MCM-ICM-2018-Problem-C\\code\\PCR\\az_data.csv",
    index=False,
    index_label=False,
    sep=',')
item_dict["MSN"] = ca_msn
item_dict["StateCode"] = ca_state_code
item_dict["Year"] = ca_year
item_dict["Data"] = ca_data
ca_comp_data = pd.DataFrame(item_dict)
ca_comp_data.to_csv(
    "C:\\Users\\THINKPAD\\PycharmProjects\\MCM-ICM-2018-Problem-C\\code\\PCR\\ca_data.csv",
    index=False,
    index_label=False,
    sep=',')
item_dict["MSN"] = nm_msn
item_dict["StateCode"] = nm_state_code
item_dict["Year"] = nm_year
Code Example #8
        tblOut['ext_arm_len'].append( abs(ea.end-ea.start)+1 )
        tblOut['lig_arm_len'].append( abs(la.end-la.start)+1 )

        ## 

        tblOut['ext_nmerfreq'].append( ea.arm_mean_kmer_freq )
        tblOut['lig_nmerfreq'].append( la.arm_mean_kmer_freq )

        tblOut['ext_tm'].append( ea.arm_tm )
        tblOut['lig_tm'].append( la.arm_tm )

        tblOut['ext_mrfAndBwa_exact'].append( eah.num_exact_hits )
        tblOut['ext_mrfAndBwa_close'].append( eah.num_close_hits )
        tblOut['ext_mrfAndBwa_all'].append( eah.num_all_hits )

        tblOut['lig_mrfAndBwa_exact'].append( lah.num_exact_hits )
        tblOut['lig_mrfAndBwa_close'].append( lah.num_close_hits )
        tblOut['lig_mrfAndBwa_all'].append( lah.num_all_hits )

        tblOut['ext_gc'].append( ea.arm_gc )
        tblOut['lig_gc'].append( la.arm_gc )



        #


    tblOut = pd.DataFrame( tblOut )
    tblOut.to_csv( o.mipTableOut, sep='\t', index=False )
Code Example #9
def analyse_all_positions():
    session = db.Session()

    gameFrames = {}  # replayid:[(start,end),..]
    replays = session.query(db.Replay).all()
    for replay in replays:
        frames = get_game_frames(replay)
        gameFrames[replay.id] = frames

    # print(gameFrames)

    # count = session.query(db.Player.rplayer_id, sq.sql.func.count('*').label('replay_count')).group_by(db.Player.rplayer_id).subquery()

    # count_positions = session.query(db.Position, db.Player.rplayer_id, sq.sql.func.count('*').label('positions')).join(db.Player).group_by(db.Player.rplayer_id).subquery()
    # count_game_positions = session.query(db.Position, db.Player.rplayer_id, sq.sql.func.count('*').label('game_positions')).join(db.Player).filter(sq.or_(sq.and_(sq.or_(db.Position.frameNo.between(frames[0],frames[1]) for frames in gameFrames[replayid]),db.Position.replay_id==replayid) for replayid in gameFrames)).group_by(db.Player.rplayer_id).subquery()
    # count_half_attacking = session.query(db.Position, db.Player.rplayer_id, sq.sql.func.count('*').label('attacking_half')).join(db.Player).filter(db.Position.y>0).group_by(db.Player.rplayer_id).subquery()
    # count_half_defending = session.query(db.Position, db.Player.rplayer_id, sq.sql.func.count('*').label('defending_half')).join(db.Player).filter(db.Position.y<0).group_by(db.Player.rplayer_id).subquery()

    print('hi')

    a = time.time()
    rplayers = session.query(db.RPlayer).all()
    # i=0
    for rplayer in tqdm.tqdm(rplayers, leave=True):
        # i += 1
        # if i>2: break
        rpid = rplayer.id
        # print('\n',rid)
        pids, rids = zip(*session.query(db.Player.id,
                                        db.Player.replay_id).filter(db.Player.rplayer_id == rpid).all())
        print(pids, rids)

        _gameFrames = {}
        for rid in rids:
            _gameFrames[rid] = gameFrames[rid]

        # limit to 5 games
        _temp_gameFrames = {}
        i = 0
        for rid in rids:
            if i > 5:
                break
            _temp_gameFrames[rid] = gameFrames[rid]
            i += 1

        print(_temp_gameFrames)
        p = session.query(
            db.Position,
            db.BallFrame.x,
            db.BallFrame.y,
            db.BallFrame.z,
            db.Team.colour,
            # db.Velocity.speed
        )\
            .filter(db.Position.player_id.in_(pids))\
            .filter(sq.or_(sq.and_(sq.or_(db.Position.frameNo.between(frames[0], frames[1]) for frames in _temp_gameFrames[replayid]), db.Position.replay_id == replayid) for replayid in _temp_gameFrames))\
            .join(db.Player).join(db.Team)\
            .join(db.BallFrame, db.BallFrame.frame_id == db.Position.frame_id)\
            # .join(db.Velocity, sq.and_(db.Position.frame_id==db.Velocity.frame_id,db.Position.player_id==db.Velocity.player_id))\

        # velocity join takes a long time.
        # TESTING ABOVE
        # p = session.query(
        #     db.Position,
        #     db.BallFrame.x,
        #     db.BallFrame.y,
        #     db.BallFrame.z,
        #     db.Team.colour,
        #     db.Velocity.speed
        # )\
        #     .filter(db.Position.player_id.in_(pids))\
        #     .filter(sq.or_(sq.and_(sq.or_(db.Position.frameNo.between(frames[0],frames[1]) for frames in _gameFrames[replayid]),db.Position.replay_id==replayid) for replayid in _gameFrames))\
        #     .join(db.Velocity, sq.and_(db.Position.frame_id==db.Velocity.frame_id,db.Position.player_id==db.Velocity.player_id))\
        #     .join(db.Player).join(db.Team)\
        #     .join(db.BallFrame,db.BallFrame.frame_id==db.Position.frame_id)\

        # END REAL PART

        # p = session.query(db.Position,db.BallFrame.x,db.BallFrame.y,db.BallFrame.z,db.Team.colour)\
        # .filter(db.Position.player_id.in_(pids))\
        # .filter(sq.or_(sq.and_(sq.or_(db.Position.frameNo.between(frames[0],frames[1]) for frames in _gameFrames[replayid]),db.Position.replay_id==replayid) for replayid in _gameFrames))\
        # .join(db.Player).join(db.Team)\
        # .join(db.BallFrame,db.BallFrame.frame_id==db.Position.frame_id)\

        positions = pd.read_sql_query(
            p.selectable,
            db.engine
        )
        # print(positions.columns.values)
        # print('\n\n\n')
        # print(positions[:5])
        # print('\n\n\n')
        # print(positions.describe())

        print(positions, "!!!!!!!")
        _positions1 = analyse_position(positions)
        _positions2 = analyse_position_velocity(positions)

        # _positions = {**(_positions1), **_positions2}
        _positions = OrderedDict()
        for key, value in _positions1.items():
            _positions[key] = value
        for key, value in _positions2.items():
            _positions[key] = value
        # print(_positions)

        try:
            for key, value in _positions.items():
                positional_analysis[key].append(value)
            positional_analysis['name'].append(rplayer.name)
            positional_analysis['team'].append(rplayer.rteam.name)
            positional_analysis['games'].append(len(pids))
        except UnboundLocalError:
            positional_analysis = OrderedDict()
            positional_analysis['name'] = [rplayer.name]
            positional_analysis['team'] = [rplayer.rteam.name]
            positional_analysis['games'] = [len(pids)]
            for key, value in _positions.items():
                positional_analysis[key] = [value]

        # print(positional_analysis)

    positional_analysis = pd.DataFrame.from_dict(positional_analysis)

    # print(positional_analysis)

    # replay = pd.read_sql_query(
    # x.selectable,
    # db.engine
    # )

    # replay.columns = replay.columns.str.replace('^anon_[0-9]+_','')

    print('duration:', int(time.time() - a))
    with open("all_player_position_analysis1.txt", 'w') as f:
        positional_analysis.to_csv(f, index=False)

    print('done')
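The try/except UnboundLocalError above exists only to create the result dict on the first player; a minimal sketch of the same accumulation with collections.defaultdict, which removes the special case (the records here are made-up placeholders):

from collections import defaultdict

import pandas as pd

positional_analysis = defaultdict(list)
for record in ({'name': 'player_a', 'team': 'team_x', 'games': 3},
               {'name': 'player_b', 'team': 'team_y', 'games': 5}):
    for key, value in record.items():
        positional_analysis[key].append(value)   # the list is created on first use
print(pd.DataFrame(positional_analysis))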
Code Example #10
class FractionTaxaBarStack(Graph):
    short_name = 'fraction_taxa_barstack'
    bottom = 0.4
    top = 0.95
    left = 0.1
    right = 0.95
    formats = ('pdf', 'eps')

    def plot(self):
        # Make Frame #
        self.frame = OrderedDict(
            (('%s - %s' % (p, f), getattr(p.fractions, f).rdp.phyla)
             for f in ('low', 'med', 'big') for p in self.parent.pools))
        self.frame = pandas.DataFrame(self.frame)
        self.frame = self.frame.fillna(0)
        # Rename #
        new_names = {
            u"run001-pool01 - low": "2-step PCR low",
            u"run001-pool02 - low": "2-step PCR low",
            u"run001-pool03 - low": "2-step PCR low",
            u"run001-pool04 - low": "1-step PCR low",
            u"run002-pool01 - low": "New chem low",
            u"run001-pool01 - med": "2-step PCR med",
            u"run001-pool02 - med": "2-step PCR med",
            u"run001-pool03 - med": "2-step PCR med",
            u"run001-pool04 - med": "1-step PCR med",
            u"run002-pool01 - med": "New chem med",
            u"run001-pool01 - big": "2-step PCR high",
            u"run001-pool02 - big": "2-step PCR high",
            u"run001-pool03 - big": "2-step PCR high",
            u"run001-pool04 - big": "1-step PCR high",
            u"run002-pool01 - big": "New chem high",
        }
        self.frame.rename(columns=new_names, inplace=True)
        self.frame = self.frame.transpose()
        # Group low abundant into 'others' #
        low_abundance = self.frame.sum() < 30000
        other_count = self.frame.loc[:, low_abundance].sum(axis=1)
        self.frame = self.frame.loc[:, ~low_abundance]
        self.frame['Others'] = other_count
        # Normalize #
        self.frame = self.frame.apply(lambda x: 100 * x / x.sum(), axis=1)
        # Sort the table by sum #
        sums = self.frame.sum()
        sums.sort(ascending=False)
        self.frame = self.frame.reindex_axis(sums.keys(), axis=1)
        # Plot #
        fig = pyplot.figure()
        axes = self.frame.plot(kind='bar', stacked=True, color=cool_colors)
        fig = pyplot.gcf()
        # Other #
        axes.set_ylabel('Relative abundances in percent')
        axes.xaxis.grid(False)
        axes.yaxis.grid(False)
        axes.set_ylim([0, 100])
        # Put a legend below current axis
        axes.legend(loc='upper center',
                    bbox_to_anchor=(0.5, -0.40),
                    fancybox=True,
                    shadow=True,
                    ncol=5,
                    prop={'size': 10})
        # Font size #
        axes.tick_params(axis='x', which='major', labelsize=11)
        # Save it #
        self.save_plot(fig, axes)
        self.frame.to_csv(self.csv_path)
        pyplot.close(fig)
Code Example #11
for i in variable1:
    if i not in variable:
        variable.append(i)
tx_data = OrderedDict()
for i in range(0, len(variable)):
    tx_data[variable[i]] = np.zeros(50)
for i in range(0, 50):
    for j in range(len(data)):
        if int(data["Year"][j]) - 1960 == i:
            if data["MSN"][j] in variable:
                tx_data[data["MSN"][j]][i] = data["Data"][j]
year = []
tx_comp_data = OrderedDict()
tx_comp_data = pd.DataFrame(tx_data)
tx_comp_data.to_csv("tx_data_by_year_original.csv",
                    index=False,
                    index_label=False,
                    sep=',')
for i in variable:
    if i != "TEGDS" and i != "Year":
        mean = np.mean(tx_data[i])
        std = np.std(tx_data[i])
        if std != 0:
            for j in range(len(tx_data[i])):
                tx_data[i][j] = (tx_data[i][j] - mean) / std

tx_comp_data = pd.DataFrame(tx_data)
tx_comp_data.to_csv("tx_data_by_year.csv",
                    index=False,
                    index_label=False,
                    sep=',')
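The standardisation loop above (subtract the mean, divide by the standard deviation, skipping TEGDS, Year and constant columns) can also be written directly against the DataFrame; a minimal sketch assuming the same layout (the second column name is illustrative):

import numpy as np
import pandas as pd

tx_comp_data = pd.DataFrame({'TEGDS': np.arange(50.0), 'CLPRB': np.random.rand(50)})
cols = [c for c in tx_comp_data.columns if c not in ('TEGDS', 'Year')]
std = tx_comp_data[cols].std(ddof=0)          # population std, matching np.std
keep = std[std != 0].index                    # skip constant (zero-std) columns
tx_comp_data[keep] = (tx_comp_data[keep] - tx_comp_data[keep].mean()) / std[keep]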
Code Example #12
    def get_stats_from_coverage_file_stream_version(self, coverage_file, output_prefix, verbose=True,
                                                    scaffold_column=0, coverage_column=1,
                                                    separator="\t", buffering=None):

        stats = OrderedDict()
        summary_stats = OrderedDict()
        with self.metaopen(coverage_file, "r", buffering=buffering) as in_fd:
            line_list = in_fd.readline().strip().split(separator)
            scaffold, coverage = line_list[scaffold_column], int(line_list[coverage_column])
            coverage_dict = OrderedDict([(coverage, 1)])
            summary_coverage_dict = OrderedDict([(coverage, 1)])
            current_scaffold = scaffold
            line_counter = 1
            for line in in_fd:
                line_list = line.strip().split(separator)
                scaffold, coverage = line_list[scaffold_column], int(line_list[coverage_column])
                if coverage in summary_coverage_dict:
                    summary_coverage_dict[coverage] += 1
                else:
                    summary_coverage_dict[coverage] = 1
                line_counter += 1
                if line_counter % 1000000 == 0:
                    print("%s\tProcessed %i lines" % (str(datetime.datetime.now()), line_counter))
                if scaffold != current_scaffold:
                    #print(scaffold)
                    print("%s\tCalculating stats for %s" % (str(datetime.datetime.now()), current_scaffold))
                    stats[current_scaffold] = [sum(list(coverage_dict.values())),
                                               min(list(coverage_dict.keys())),
                                               max(list(coverage_dict.keys())),
                                               self.mean_from_dict(coverage_dict),
                                               self.median_from_dict(coverage_dict)]
                    coverage_dict = OrderedDict([(coverage, 1)])
                    current_scaffold = scaffold

                else:
                    if coverage in coverage_dict:
                        coverage_dict[coverage] += 1
                    else:
                        coverage_dict[coverage] = 1
            else:
                #print("END")
                #print(scaffold)
                stats[current_scaffold] = [sum(list(coverage_dict.values())),
                                           min(list(coverage_dict.keys())),
                                           max(list(coverage_dict.keys())),
                                           self.mean_from_dict(coverage_dict),
                                           self.median_from_dict(coverage_dict)]

        summary_stats["all"] = [sum(list(summary_coverage_dict.values())),
                                min(list(summary_coverage_dict.keys())),
                                max(list(summary_coverage_dict.keys())),
                                self.mean_from_dict(summary_coverage_dict),
                                self.median_from_dict(summary_coverage_dict)]

        #print(stats)
        stats = pd.DataFrame.from_dict(stats, orient="index", columns=["length", "min", "max", "mean", "median"])
        summary_stats = pd.DataFrame.from_dict(summary_stats, orient="index", columns=["length", "min", "max", "mean", "median"])
        stats.to_csv("%s.per_scaffold.stat" % output_prefix, sep="\t", index_label="#scaffold")
        summary_stats.to_csv("%s.all.stat" % output_prefix, sep="\t", index_label="#scaffold")
        if verbose:
            print(stats)
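mean_from_dict and median_from_dict are helpers of the surrounding class and are not shown in this excerpt; a plausible sketch of such helpers over a {coverage: count} dictionary (an assumption about their behaviour, not the original code):

def mean_from_dict(count_dict):
    # weighted mean of the coverage values, weighted by how often each occurs
    total = sum(count_dict.values())
    return sum(value * count for value, count in count_dict.items()) / float(total)

def median_from_dict(count_dict):
    # walk the sorted values until half of the total count is passed
    # (returns the lower middle value when the total count is even)
    total = sum(count_dict.values())
    running = 0
    for value in sorted(count_dict):
        running += count_dict[value]
        if 2 * running >= total:
            return value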
Code Example #13
y_final_pred = classifier.predict(x_testdata)

t = testing_data['PERID']

test = []
yp = []
for i in range(0, 11430):
    test.append(t[i])
    yp.append(y_final_pred[i])

from collections import OrderedDict

df = OrderedDict([('PERID', test), ('Criminal', yp)])
df = pd.DataFrame.from_dict(df)
df.to_csv('first.csv', index=False)

# n_estimators = 300 , max_depth = 3 , min_child_weight = 3

# Applying Grid Search to find the best model and the best parameters
from sklearn.model_selection import GridSearchCV
parameters = [{'learning_rate': [0.1, 0.2, 0.3]}]
grid_search = GridSearchCV(estimator=classifier,
                           param_grid=parameters,
                           scoring='accuracy',
                           cv=10,
                           return_train_score=False)
grid_search = grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
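In keeping with the other examples, the full grid-search outcome can also be written to CSV via the cv_results_ attribute; a self-contained sketch with a stand-in estimator and dataset (the original example's classifier and data are not reproduced here):

import pandas as pd
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

X, y = load_iris(return_X_y=True)
grid_search = GridSearchCV(LogisticRegression(max_iter=500),
                           param_grid={'C': [0.1, 1.0, 10.0]},
                           scoring='accuracy',
                           cv=10).fit(X, y)
# cv_results_ is a dict of arrays with one entry per parameter combination
pd.DataFrame(grid_search.cv_results_)[
    ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']
].to_csv('grid_search_results.csv', index=False)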
Code Example #14
    def get_coverage_stats_in_windows(self,
                                      coverage_file,
                                      window_size,
                                      output_prefix,
                                      window_step=None,
                                      buffering=None):
        win_step = window_size if window_step is None else window_step
        stats = []
        per_scaffold_stats = OrderedDict()
        coverage_dict = OrderedDict()
        summary_stats = OrderedDict()
        total_length = 0

        with self.metaopen(coverage_file, "r", buffering=buffering) as in_fd:
            prev_scaffold, start, end, coverage = in_fd.readline().strip(
            ).split()
            coverage_list = [int(coverage)] * (int(end) - int(start))
            for line in in_fd:
                current_scaffold, start, end, coverage = line.strip().split()
                if current_scaffold == prev_scaffold:
                    coverage_list += [int(coverage)] * (int(end) - int(start))
                else:
                    scaffold_length = len(coverage_list)
                    if scaffold_length >= window_size:
                        number_of_windows = int(
                            (scaffold_length - window_size) / win_step) + 1
                        for i in range(0, number_of_windows):
                            win_start = i * win_step
                            window_coverage_list = coverage_list[
                                win_start:win_start + window_size]
                            uncovered = window_coverage_list.count(0)
                            stats.append([
                                prev_scaffold, scaffold_length, i,
                                np.mean(window_coverage_list),
                                np.median(window_coverage_list),
                                np.min(window_coverage_list),
                                np.max(window_coverage_list), uncovered,
                                float(uncovered) / float(window_size)
                            ], )

                    coverage_array, count_array = np.unique(coverage_list,
                                                            return_counts=True)
                    for i in range(0, len(coverage_array)):
                        if coverage_array[i] in coverage_dict:
                            coverage_dict[coverage_array[i]] += count_array[i]
                        else:
                            coverage_dict[coverage_array[i]] = count_array[i]

                    per_scaffold_stats[prev_scaffold] = [
                        scaffold_length,
                        min(coverage_list),
                        max(coverage_list),
                        np.mean(coverage_list),
                        np.median(coverage_list)
                    ]

                    prev_scaffold = current_scaffold
                    coverage_list = [int(coverage)] * (int(end) - int(start))

                    total_length += scaffold_length

            scaffold_length = len(coverage_list)
            total_length += scaffold_length

            if scaffold_length >= window_size:
                number_of_windows = int(
                    (scaffold_length - window_size) / win_step) + 1
                for i in range(0, number_of_windows):
                    win_start = i * win_step
                    window_coverage_list = coverage_list[win_start:win_start +
                                                         window_size]
                    uncovered = window_coverage_list.count(0)
                    stats.append([
                        prev_scaffold, scaffold_length, i,
                        np.mean(window_coverage_list),
                        np.median(window_coverage_list),
                        np.min(window_coverage_list),
                        np.max(window_coverage_list), uncovered,
                        float(uncovered) / float(window_size)
                    ], )

            per_scaffold_stats[prev_scaffold] = [
                scaffold_length,
                min(coverage_list),
                max(coverage_list),
                np.mean(coverage_list),
                np.median(coverage_list)
            ]

            coverage_array, count_array = np.unique(coverage_list,
                                                    return_counts=True)
            for i in range(0, len(coverage_array)):
                if coverage_array[i] in coverage_dict:
                    coverage_dict[coverage_array[i]] += count_array[i]
                else:
                    coverage_dict[coverage_array[i]] = count_array[i]

        stats = pd.DataFrame.from_records(
            stats,
            index=("scaffold", "window"),
            columns=("scaffold", "scaffold_length", "window", "mean", "median",
                     "min", "max", "uncovered", "uncovered,fraction"))

        summary_stats["all"] = [
            total_length,
            min(list(coverage_dict.keys())),
            max(list(coverage_dict.keys())),
            self.mean_from_dict(coverage_dict),
            self.median_from_dict(coverage_dict)
        ]

        summary_stats = pd.DataFrame.from_dict(
            summary_stats,
            orient="index",
            columns=["length", "min", "max", "mean", "median"])

        per_scaffold_stats = pd.DataFrame.from_dict(
            per_scaffold_stats,
            orient="index",
            columns=["length", "min", "max", "mean", "median"])

        stats.to_csv("{0}.win{1}.step{2}.stat".format(output_prefix,
                                                      window_size, win_step),
                     sep="\t",
                     header=True,
                     index=True)
        summary_stats.to_csv("%s.all.stat" % output_prefix,
                             sep="\t",
                             index_label="#scaffold")
        per_scaffold_stats.to_csv("%s.per_scaffold.stat" % output_prefix,
                                  sep="\t",
                                  index_label="#scaffold")
Code Example #15
te_data = OrderedDict()
item_dict = OrderedDict()
item_dict["MSN"] = te_msn
item_dict["Description"] = te_description
item_dict["Unit"] = te_unit
te_data = pd.DataFrame(item_dict)

tn_data = OrderedDict()
item_dict = OrderedDict()
item_dict["MSN"] = tn_msn
item_dict["Description"] = tn_description
item_dict["Unit"] = tn_unit
tn_data = pd.DataFrame(item_dict)

# data_frame.to_csv("C:\\Users\\THINKPAD\\PycharmProjects\\MCM-ICM2018\\data\\test.csv",index=False,index_label=False,sep=',')
comp_data.to_csv("data/csv/total_sector.csv",
                 index=False,
                 index_label=False,
                 sep=',')
tn_data.to_csv("data/csv/tn_sector.csv",
               index=False,
               index_label=False,
               sep=',')
te_data.to_csv("data/csv/te_sector.csv",
               index=False,
               index_label=False,
               sep=',')
print(comp_data)
print(tn_data)
print(te_data)
Code Example #16
b_names = [
    'TRAIN_TT', 'SM_TT', 'CAR_TT', 'TRAIN_HE', 'SM_HE', 'SM_SEATS',
    'TRAIN_ONE', 'SM_ONE'
]
b_mean = OrderedDict(zip(b_names, b.mean(0)))
b_std = OrderedDict(zip(b_names, b.std(0)))

b_max = OrderedDict(zip(b_names, b.max(0)[0]))
b_min = OrderedDict(zip(b_names, b.min(0)[0]))

b_mean = bToDataFrame(b_mean)
b_std = bToDataFrame(b_std)
b_max = bToDataFrame(b_max)
b_min = bToDataFrame(b_min)

b_mean.to_csv(args.result_path + "/" + "b_mean.csv")
b_std.to_csv(args.result_path + "/" + "b_std.csv")
b_max.to_csv(args.result_path + "/" + "b_max.csv")
b_min.to_csv(args.result_path + "/" + "b_min.csv")

print "\nb_mean"
print b_mean

print "\nb_std"
print b_std

print "\nb_max"
print b_max

print "\nb_min"
print b_min
コード例 #17
0
file_in = sys.argv[1]
file_out1 = sys.argv[2]
file_out2 = sys.argv[3]
file_out3 = sys.argv[4]

K = 5

df = pd.read_parquet(file_in)
rand_idps = np.random.randint(0, df.shape[1] - 1, size=K) + 1  # the offset is indiv col
pheno_names = []
out = OrderedDict()
out['indiv'] = df.indiv
for i, k in enumerate(list(rand_idps)):
    name = '{}_{}'.format(i, df.columns[k])
    tmp = df.iloc[:, k].values
    tmp = tmp - tmp.mean()
    out[name] = tmp + np.random.normal(scale=4 * tmp.std(), size=df.shape[0]) + np.random.randint(-5, 5, size=1)
    name = '{}_null'.format(i)
    out[name] = np.random.normal(scale=4 * tmp.std(), size=df.shape[0]) + np.random.randint(-5, 5, size=1)
out = pd.DataFrame(out)
out.to_csv(file_out1, index=False)

with open(file_out2, 'w') as f:
    yaml.dump({ k: 'linear_regression' for k in out.keys()[1:] }, f)

with open(file_out3, 'w') as f:
    yaml.dump({ k: 'susie' for k in out.keys()[1:] }, f)
    

コード例 #18
0
                                  verbose=not args.Q)
    read_stats['tag'] = tag
    base_stats = read_stats['base_stats']
    precision_stats = read_stats['read_stats']

    base_stats_qc(base_stats, plotter)
    modes = read_precision_qc(precision_stats, plotter)

    plotter.close()

    global_stats = OrderedDict([
        ('Accuracy', [read_stats['base_stats']['accuracy']]),
        ('AccuracyMode', modes['accuracy_mode']),
        ('Identity', [read_stats['base_stats']['identity']]),
        ('IdentityMode', modes['identity_mode']),
        ('Mapped', [read_stats['mapped']]),
        ('Unmapped', [read_stats['unmapped']]),
        ('Tag', [read_stats['tag']]),
    ])
    global_stats = pd.DataFrame(global_stats)

    if args.g is not None:
        global_stats.to_csv(args.g, sep="\t", index=False)

    if args.l is not None:
        read_df = pd.DataFrame(precision_stats)
        read_df.to_csv(args.l, sep="\t", index=False)

    if args.p is not None:
        misc.pickle_dump(read_stats, args.p)
コード例 #19
0
                    ]
                    for o in onlyfileszn:
                        if '.csv' not in o:
                            i2 = onlyfileszn.index(o)
                            path5 = path4 + '/biextracted/' + str(i2)
                            zip2 = zipfile.ZipFile(path4 + '/' + o)
                            zip2.extractall(path5)
                            #inside_path5 = [iif for iif in listdir(path5) if isfile(join(path5, iif))]
                            inside_path5 = zip2.namelist()
                            T1 = [di for di in inside_path5 if 'T1' in di]
                            ### get date and pod
                            for T in T1:
                                dt = datetime.datetime(y, int(T[2:4]),
                                                       int(T[5:7]))
                                pod = T[T.find('_') + 1:T.find('.csv')]
                                s = zip2.read(T)
                                #                            t1df = pd.read_csv(path5 + '/' + T1, sep = ';', dtype = object)
                                todiz = [pod, dt, s[548:741]]
                                #                            todiz.extend(Aggregator(t1df).tolist())
                                df[count] = todiz
                                count += 1
    #### http://stackoverflow.com/questions/303200/how-do-i-remove-delete-a-folder-that-is-not-empty-with-python
                            zip2.close()
                    shutil.rmtree(path5)
                zip_ref.close()
                shutil.rmtree(path4)
    df = pd.DataFrame.from_dict(df, orient='index')
    df.to_csv('Hdatabase_' + str(y), sep=';')
    del df
#### copy all files into a new directory and then operate in the new directory
#### https://docs.python.org/2/library/shutil.html
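
The loop above extracts each inner archive to disk before reading single members from it. The same nested-zip access can also be done in memory with zipfile and io.BytesIO; a small sketch of that alternative (file and member names below are placeholders, and the byte slice [548:741] is simply carried over from the snippet):

import io
import zipfile

def read_nested_member(outer_zip_path, inner_zip_name, member_name):
    # Open the outer archive, load the inner archive into memory,
    # then read one member of the inner archive as raw bytes.
    with zipfile.ZipFile(outer_zip_path) as outer:
        inner_bytes = outer.read(inner_zip_name)
        with zipfile.ZipFile(io.BytesIO(inner_bytes)) as inner:
            return inner.read(member_name)

# Hypothetical usage:
# payload = read_nested_member('measures.zip', 'pod_archive.zip', 'T1_03_01_POD123.csv')
# print(payload[548:741])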
コード例 #20
0
        tx_state_code.append(seseds["StateCode"][i])
        tx_year.append(seseds["Year"][i])
        tx_data.append(seseds["Data"][i])
az_comp_data = OrderedDict()
ca_comp_data = OrderedDict()
nm_comp_data = OrderedDict()
tx_comp_data = OrderedDict()
item_dict = OrderedDict()
item_dict["MSN"] = az_msn
item_dict["StateCode"] = az_state_code
item_dict["Year"] = az_year
item_dict["Data"] = az_data
az_comp_data = pd.DataFrame(item_dict)
az_comp_data.to_csv(
    "C:/Users/THINKPAD/PycharmProjects/MCM-ICM-2018-Problem-C/data/csv/az_data.csv",
    index=False,
    index_label=False,
    sep=',')
item_dict["MSN"] = ca_msn
item_dict["StateCode"] = ca_state_code
item_dict["Year"] = ca_year
item_dict["Data"] = ca_data
ca_comp_data = pd.DataFrame(item_dict)
ca_comp_data.to_csv(
    "C:/Users/THINKPAD/PycharmProjects/MCM-ICM-2018-Problem-C/data/csv/ca_data.csv",
    index=False,
    index_label=False,
    sep=',')
item_dict["MSN"] = nm_msn
item_dict["StateCode"] = nm_state_code
item_dict["Year"] = nm_year
コード例 #21
0
                                dt = datetime.datetime(y, int(T[2:4]),
                                                       int(T[5:7]))
                                pod = T[T.find('_') + 1:T.find('.csv')]
                                s = zip2.read(T)
                                #                            t1df = pd.read_csv(path5 + '/' + T1, sep = ';', dtype = object)
                                todiz = [pod, dt, s[548:741]]
                                #                            todiz.extend(Aggregator(t1df).tolist())
                                df[count] = todiz
                                count += 1
    #### http://stackoverflow.com/questions/303200/how-do-i-remove-delete-a-folder-that-is-not-empty-with-python
                            zip2.close()
                    shutil.rmtree(path5)
                zip_ref.close()
                shutil.rmtree(path4)
    df = pd.DataFrame.from_dict(df, orient='index')
    df.to_csv('Hdatabase_' + str(y), sep=';')
    del df
#### copy all files into a new directory and then operate in the new directory
#### https://docs.python.org/2/library/shutil.html

###############################################################################

extracter = 'C:/Users/d_floriello/Desktop/tbe2'
extracter = 'H:/Energy Management/02. EDM/01. MISURE/3. DISTRIBUTORI/ENEL Distribuzione S.p.A/2017/2017-03/giornalieri/csv'
onlyfiles = [f for f in listdir(extracter) if isfile(join(extracter, f))]

for of in onlyfiles:
    if '.zip' in of:
        zip_ref = zipfile.ZipFile(extracter + '/' + of)
        zip_ref.extractall(extracter)
        zip_ref.close()
コード例 #22
0
ファイル: print_table.py プロジェクト: pcko1/moses
        max_val = (2 * d - 1) * np.max(
            [(2 * d - 1) * m
             for m, n in zip(metrics[col], metrics['Model']) if n != 'Train'])
        metrics[col] = [
            str(x) if x != max_val or n == 'Train' else bf_pattern.format(x)
            for x, n in zip(metrics[col], metrics['Model'])
        ]
    for col in targets[::-1]:
        metrics[col] = [
            it_pattern.format(x) if n == 'Train' else x
            for x, n in zip(metrics[col], metrics['Model'])
        ]

    metrics = metrics.round(config.precision)
    if config.extension == 'csv':
        metrics.to_csv(config.output, index=None)
    elif config.extension == 'html':
        html = metrics.to_html(index=None)
        html = re.sub('&lt;', '<', html)
        html = re.sub('&gt;', '>', html)
        header, footer = html.split('</thead>')
        header += '</thead>'
        header = header.split('\n')
        values = [x.strip()[4:-5] for x in header[3:-2]]
        spans = ['rowspan' if '/' not in x else 'colspan' for x in values]
        first_header = [x.split('/')[0] for x in values]
        second_header = [x.split('/')[1] for x in values if '/' in x]
        new_header = header[:3]
        i = 0
        total = 0
        while i < len(first_header):
コード例 #23
0
ファイル: performance.py プロジェクト: rurban/randomgen
        repeat=REPEAT,
    )
    col[key] = 1000 * min(t)
table["NumPy"] = pd.Series(col)
final = table

func_list = list(funcs.keys())
table = pd.DataFrame(final)
table = table.reindex(table.mean(1).sort_values().index)
order = np.log(table).mean().sort_values().index
table = table.T
table = table.reindex(order, axis=0)
table = table.reindex(func_list, axis=1)
table = 1000000 * table / (SIZE * NUMBER)
table.index.name = "Bit Gen"
print(table.to_csv(float_format="%0.1f"))

try:
    from tabulate import tabulate

    perf = table.applymap(lambda v: "{0:0.1f}".format(v))
    print(tabulate(perf, headers="keys", tablefmt="rst"))
except ImportError:
    pass

table = table.T
rel = table.loc[:, ["NumPy"]].values @ np.ones((1, table.shape[1])) / table
rel.pop("NumPy")
rel = rel.T
rel["Overall"] = np.exp(np.log(rel).mean(1))
rel *= 100
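
The last block converts absolute timings into performance relative to the NumPy column and condenses each column into a single number with a geometric mean, np.exp(np.log(rel).mean()). A toy version of the same computation (the timing values are invented):

import numpy as np
import pandas as pd

# Toy timing table: rows are functions, columns are generators (ms per call).
table = pd.DataFrame({"NumPy": [2.0, 4.0, 8.0], "PCG64": [1.0, 2.0, 2.0]},
                     index=["normal", "uniform", "gamma"])

# Relative performance: NumPy time divided by each column's time,
# so values above 100 (after scaling to percent) mean faster than NumPy.
rel = table[["NumPy"]].values @ np.ones((1, table.shape[1])) / table
rel = rel.drop(columns="NumPy") * 100

# Geometric mean over functions gives one overall figure per generator.
overall = np.exp(np.log(rel).mean(0))
print(rel)
print(overall)  # PCG64 ≈ 252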
コード例 #24
0
    opts.add_argument('--mipGenOut', default=None, dest='mipGenOut')

    o = opts.parse_args()

    pairStore = pd.HDFStore(o.inStorePairs, 'r')
    tblPairs = pairStore[o.tablePairs]

    (tblnameExtArm, tblnameLigArm) = pairStore.get_storer(
        o.tablePairs).attrs['mippipe__ext_and_lig_arm_tables']

    armStore = pd.HDFStore(o.inStoreArms, 'r')
    tblArmExt = armStore[tblnameExtArm]
    tblArmLig = armStore[tblnameLigArm]

    tblOut = OrderedDict([(col, []) for col in [
        'mip_key', 'chr', 'ext_probe_start', 'ext_probe_stop',
        'lig_probe_start', 'lig_probe_stop'
    ]])

    for i, r in tblPairs.iterrows():
        tblOut['mip_key'].append(int(i))
        tblOut['chr'].append(r.chrom)
        tblOut['ext_probe_start'].append(tblArmExt.loc[r.extarmidx, 'start'])
        tblOut['ext_probe_stop'].append(tblArmExt.loc[r.extarmidx, 'end'])
        tblOut['lig_probe_start'].append(tblArmLig.loc[r.ligarmidx, 'start'])
        tblOut['lig_probe_stop'].append(tblArmLig.loc[r.ligarmidx, 'end'])

    tblOut = pd.DataFrame(tblOut)
    tblOut.to_csv(o.mipGenOut, sep='\t', index=False)
コード例 #25
0
###Extract specifications for all unique cars
search_texts = ['Car', 'Power', 'Torque', 'Car type', 'Curb weight', 'Dimensions', 'Wheelbase',
                "Power / weight", 'Introduced', 'Origin country', 'Engine type', 'Displacement',
                "Power / liter", 'Transmission', 'Layout', 'Top speed', "1/4 mile", 'car URL']

#Make an ordered list of tuples to create a dictionary with ordered keys
car_variables = [('Car', []), ('Power', []), ('Torque', []), ('Type', []), ('Weight', []), ('Dimensions', []),
                 ('Wheelbase', []), ('Power Per Weight', []), ('Year Model', []), ('Country', []), ('Engine Type', []), ('Displacement', []),
                 ('Power Per Liter', []), ('Transmission', []), ('Layout', []), ('Top Speed', []), ('Quarter Mile Time', []), ('Car URL', [])]
car_specs = OrderedDict(car_variables)

for i in cars.index.values:
    #Go to each car specs HTML page
    car_url = "http://" + cars.iloc[i, 1]
    r = requests.get(car_url)
    soup = BeautifulSoup(r.text)
    #Extract each car specs
    car_specs['Car'].append(cars.iloc[i,0])
    car_specs['Car URL'].append(car_url)
    spec_keys = list(car_specs.keys())
    for idx in range(1, 3): #Extract power and torque info
        try: car_specs[spec_keys[idx]].append(soup.find(text=search_texts[idx]).findNext('td').find('a').string)
        except: car_specs[spec_keys[idx]].append(None) #Return null if blank
    for idx in range(3, len(car_variables)-1): #Extract info for everything else
        try: car_specs[spec_keys[idx]].append(soup.find(text=search_texts[idx]).findNext('td').string)
        except: car_specs[spec_keys[idx]].append(None) #Return null if blank

#Transform the dictionary into a dataframe
car_specs = pd.DataFrame(car_specs)
#Save the dataframe into a CSV file
car_specs.to_csv('cars_specifications.csv', index=False, encoding = 'UTF-8')
コード例 #26
0
from numpy.random import RandomState
rg = RandomState()
"""
col = {}
for key in npfuncs:
    t = repeat(test.format(func=npfuncs[key]),
               setup.format(prng=prng().__class__.__name__),
               number=1,
               repeat=3)
    col[key] = 1000 * min(t)
table['RandomState'] = pd.Series(col)

table = pd.DataFrame(table)
table = table.reindex(table.mean(1).sort_values().index)
order = np.log(table).mean().sort_values().index
table = table.T
table = table.reindex(order)
table = table.T
table = table.reindex([k for k in funcs], axis=0)
print(table.to_csv(float_format='%0.1f'))

rel = table.loc[:, ['RandomState']].values @ np.ones(
    (1, table.shape[1])) / table
rel.pop('RandomState')
rel = rel.T
rel['Overall'] = np.exp(np.log(rel).mean(1))
rel *= 100
rel = np.round(rel)
rel = rel.T
print(rel.to_csv(float_format='%0d'))
コード例 #27
0
    'xoroshiro128plus': 'xoroshiro128+',
    'xorshift1024': 'xorshift1024',
    'pcg64': 'PCG64',
    'mt19937': 'MT19937',
    'random': 'NumPy MT19937'
}

results.columns = [cols[c] for c in results]
results.index = [index[i] for i in results.index]

print(results)

from io import StringIO

sio = StringIO()
results.to_csv(sio)
sio.seek(0)
lines = sio.readlines()
for i, line in enumerate(lines):
    if i == 0:
        line = '    :header: ' + line
    else:
        line = '    ' + line
    lines[i] = line

lines.insert(1, '    \n')
lines.insert(1, '    :widths: 14,14,14,14,14,14,14,14\n')
lines.insert(0, '.. csv-table::\n')
print(''.join(lines))

std_results = (results.T / results.iloc[:, -3]).T
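
The indentation loop above is essentially a DataFrame-to-reStructuredText converter. A self-contained version of the same idea, wrapped in a helper (the :widths: value is just carried over from the snippet, not something the directive requires):

from io import StringIO

import pandas as pd

def to_rst_csv_table(frame, widths=None):
    # Render a DataFrame as an RST csv-table directive: indent the CSV
    # lines and turn the first line into the :header: option.
    sio = StringIO()
    frame.to_csv(sio)
    lines = sio.getvalue().splitlines(True)
    body = ['    :header: ' + lines[0]] + ['    ' + line for line in lines[1:]]
    directive = ['.. csv-table::\n']
    if widths is not None:
        directive.append('    :widths: ' + widths + '\n')
    directive.append('    \n')
    return ''.join(directive + body)

frame = pd.DataFrame({"PCG64": [2.3, 4.1], "MT19937": [3.0, 5.2]}, index=["normal", "uniform"])
print(to_rst_csv_table(frame, widths="14,14,14"))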
コード例 #28
0
def reducer(key, values):
    # key : string of intermediary key
    # load the return dict corresponding to the mapper output; it needs to be loaded.
    # DEBUG
    import mapreduce as GLOBAL
    output_permutations = GLOBAL.OUTPUT_PERMUTATIONS
    map_output = GLOBAL.MAP_OUTPUT
    output_path = GLOBAL.OUTPUT_PATH
    roi = GLOBAL.ROI
    BASE = os.path.join("/neurospin/brainomics/2014_deptms/results_enettv/",
                        "MRI_" + roi,
                        map_output)
    INPUT = BASE + "/%i/%s"
    OUTPUT = BASE + "/../" + output_path
    if not os.path.exists(OUTPUT):
        os.makedirs(OUTPUT)
    criteria = GLOBAL.CRITERIA
    keys = ['_'.join(str(e) for e in a) for a in criteria]
    OK = 0
    # params = criteria = ['recall_mean', 'min_recall', 'max_pvalue_recall',
    #                     'accuracy', 'pvalue_accuracy']
    if not OK:
        for key in keys:
            print "key: ", key
            paths_CV_all = [INPUT % (perm, key) \
                    for perm in xrange(NFOLDS * NRNDPERMS)]
            idx_CV_blocks = range(0, (NFOLDS * NRNDPERMS) + NFOLDS, NFOLDS)
            recall_0_perms = np.zeros(NRNDPERMS)
            recall_1_perms = np.zeros(NRNDPERMS)
            recall_mean_perms = np.zeros(NRNDPERMS)
            accuracy_perms = np.zeros(NRNDPERMS)
            auc_perms = np.zeros(NRNDPERMS)
            crit = key[0:len(key):2]
            if not os.path.isfile(OUTPUT + \
                                  "/perms_validation_" + crit + ".npz"):
                for perm in xrange(NRNDPERMS):
                    print "perm: ", perm
                    paths_CV_blocks = paths_CV_all[idx_CV_blocks[perm]:\
                                                    idx_CV_blocks[perm + 1]]
                    values = [GLOBAL.OutputCollector(p) \
                                for p in paths_CV_blocks]
                    values = [item.load() for item in values]
                    y_true = [item["y_true"].ravel() for item in values]
                    y_pred = [item["y_pred"].ravel() for item in values]
                    prob_pred = [item["proba_pred"].ravel() for item in values]
                    y_true = np.concatenate(y_true)
                    y_pred = np.concatenate(y_pred)
                    prob_pred = np.concatenate(prob_pred)
                    p, r, f, s = precision_recall_fscore_support(y_true,
                                                                 y_pred,
                                                                 average=None)
                    auc = roc_auc_score(y_true, prob_pred)
                    success = r * s
                    success = success.astype('int')
                    accuracy = (r[0] * s[0] + r[1] * s[1])
                    accuracy = accuracy.astype('int')
                    recall_0_perms[perm] = r[0]
                    recall_1_perms[perm] = r[1]
                    recall_mean_perms[perm] = r.mean()
                    accuracy_perms[perm] = accuracy / float(s[0] + s[1])
                    auc_perms[perm] = auc
                # END PERMS
                print "save", crit
                np.savez_compressed(OUTPUT + \
                                    "/perms_validation_" + crit + ".npz",
                                recall_0=recall_0_perms,
                                recall_1=recall_1_perms,
                                recall_mean=recall_mean_perms,
                                accuracy=accuracy_perms,
                                auc=auc_perms)
        OK = 1
    #pvals
    if  not os.path.isfile(os.path.join(OUTPUT, output_permutations)):
        print "Derive p-values"
        perms = dict()
        for i, key in enumerate(keys):
            crit = key[0:len(key):2]
            print "crit: ", crit
            perms[crit] = np.load(OUTPUT + \
                                    "/perms_validation_" + crit + ".npz")
        print keys
        [recall_mean, min_recall, accuracy] = [keys[0][0:len(keys[0]):2],
                                               keys[1][0:len(keys[1]):2],
                                               keys[2][0:len(keys[2]):2]]
        print [recall_mean, min_recall, accuracy]
        # Read true scores
        true = pd.read_csv(os.path.join(BASE, "..",
                                        "results_dCV_validation.csv"))
        true_recall_mean = true[true.params == recall_mean].iloc[0]
        true_min_recall = true[true.params == min_recall].iloc[0]
        true_accuracy = true[true.params == accuracy].iloc[0]
        # pvals corrected for multiple comparisons
        nperms = float(len(perms[recall_mean]['recall_0']))
        from collections import OrderedDict
        pvals = OrderedDict()
        #cond: criterion used to select the model
        pvals["cond"] = ['recall_mean'] * 5 + ['min_recall'] * 5 + \
                        ['accuracy'] * 5
        #stat: statistic associated with the p-value
        pvals["stat"] = ['recall_0', 'recall_1', 'recall_mean',
                         'accuracy', 'auc'] * 3
        pvals["pval"] = [
        np.sum(perms[recall_mean]['recall_0'] > true_recall_mean["recall_0"]),
        np.sum(perms[recall_mean]['recall_1'] > true_recall_mean["recall_1"]),
        np.sum(perms[recall_mean]['recall_mean'] > true_recall_mean["recall_mean"]),
        np.sum(perms[recall_mean]['accuracy'] > true_recall_mean["accuracy"]),
        np.sum(perms[recall_mean]['auc'] > true_recall_mean["auc"]),
    
        np.sum(perms[min_recall]['recall_0'] > true_min_recall["recall_0"]),
        np.sum(perms[min_recall]['recall_1'] > true_min_recall["recall_1"]),
        np.sum(perms[min_recall]['recall_mean'] > true_min_recall["recall_mean"]),
        np.sum(perms[min_recall]['accuracy'] > true_min_recall["accuracy"]),
        np.sum(perms[min_recall]['auc'] > true_min_recall["auc"]),
    
        np.sum(perms[accuracy]['recall_0'] > true_accuracy["recall_0"]),
        np.sum(perms[accuracy]['recall_1'] > true_accuracy["recall_1"]),
        np.sum(perms[accuracy]['recall_mean'] > true_accuracy["recall_mean"]),
        np.sum(perms[accuracy]['accuracy'] > true_accuracy["accuracy"]),
        np.sum(perms[accuracy]['auc'] > true_accuracy["auc"])]
    
        pvals = pd.DataFrame(pvals)
        pvals["pval"] /= float(nperms)
        pvals.to_csv(os.path.join(OUTPUT, output_permutations),
                     index=False)
    return {}
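
Both this reducer and the one in the next example derive permutation p-values the same way: count how many permuted scores beat the observed score and divide by the number of permutations. A minimal sketch of that calculation, separate from the project's own pipeline (the strict > mirrors the snippets; (count + 1) / (n + 1) is a common, slightly more conservative variant):

import numpy as np

def permutation_pvalue(perm_scores, true_score):
    # One-sided p-value: fraction of permuted scores larger than the
    # score obtained on the unpermuted data.
    perm_scores = np.asarray(perm_scores, dtype=float)
    return np.sum(perm_scores > true_score) / float(perm_scores.size)

rng = np.random.RandomState(0)
null_scores = rng.normal(loc=0.50, scale=0.05, size=1000)  # simulated permutation scores
print(permutation_pvalue(null_scores, true_score=0.62))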
コード例 #29
0
def reducer_(key, values):
    # key : string of intermediary key
    # load the return dict corresponding to the mapper output; it needs to be loaded.
    # DEBUG
    import glob, mapreduce
    BASE = "/neurospin/brainomics/2013_adni/ADAS11-MCIc-CTL/rndperm"
    INPUT = BASE + "/%i/%s"
    OUTPUT = BASE + "/../results/rndperm"
    keys = ["0.001_0.3335_0.3335_0.333_-1",  "0.001_0.5_0_0.5_-1",  "0.001_0.5_0.5_0_-1",  "0.001_1_0_0_-1"]
    for key in keys:
        #key = keys[0]
        paths_5cv_all = [INPUT % (perm, key) for perm in xrange(NFOLDS * NRNDPERMS)]
        idx_5cv_blocks = range(0, (NFOLDS * NRNDPERMS) + NFOLDS, NFOLDS)
        cpt = 0
        qc = dict()
        r2_perms = np.zeros(NRNDPERMS)
        corr_perms = np.zeros(NRNDPERMS)
        r_bar_perms = np.zeros(NRNDPERMS)
        fleiss_kappa_stat_perms = np.zeros(NRNDPERMS)
        dice_bar_perms = np.zeros(NRNDPERMS)
        for perm_i in xrange(len(idx_5cv_blocks)-1):
            paths_5cv = paths_5cv_all[idx_5cv_blocks[perm_i]:idx_5cv_blocks[perm_i+1]]
            for p in paths_5cv:
                if os.path.exists(p) and not(p in qc):
                    if p in qc:
                        qc[p] += 1
                    else:
                        qc[p] = 1
                    cpt += 1
            #
            values = [mapreduce.OutputCollector(p) for p in paths_5cv]
            values = [item.load() for item in values]
            y_true = [item["y_true"].ravel() for item in values]
            y_pred = [item["y_pred"].ravel() for item in values]
            y_true = np.concatenate(y_true)
            y_pred = np.concatenate(y_pred)
            r2 = r2_score(y_true, y_pred)
            corr = np.corrcoef(y_true.ravel(), y_pred.ravel())[0, 1]
            betas = np.hstack([item["beta"] for item in values]).T
            #
            ## Compute beta similarity measures
            #
            # Correlation
            R = np.corrcoef(betas)
            R = R[np.triu_indices_from(R, 1)]
            # Fisher z-transformation / average
            z_bar = np.mean(1. / 2. * np.log((1 + R) / (1 - R)))
            # back-transform
            r_bar = (np.exp(2 * z_bar) - 1) /  (np.exp(2 * z_bar) + 1)
            #
            # threshold betas to compute fleiss_kappa and DICE
            try:
                betas_t = np.vstack([array_utils.arr_threshold_from_norm2_ratio(betas[i, :], .99)[0] for i in xrange(betas.shape[0])])
                print "--", np.sqrt(np.sum(betas_t ** 2, 1)) / np.sqrt(np.sum(betas ** 2, 1))
                print np.allclose(np.sqrt(np.sum(betas_t ** 2, 1)) / np.sqrt(np.sum(betas ** 2, 1)), [0.99]*5,
                                   rtol=0, atol=1e-02)
                #
                # Compute fleiss kappa statistics
                beta_signed = np.sign(betas_t)
                table = np.zeros((beta_signed.shape[1], 3))
                table[:, 0] = np.sum(beta_signed == 0, 0)
                table[:, 1] = np.sum(beta_signed == 1, 0)
                table[:, 2] = np.sum(beta_signed == -1, 0)
                fleiss_kappa_stat = fleiss_kappa(table)
                #
                # Pairwise Dice coefficient
                beta_n0 = betas_t != 0
                ij = [[i, j] for i in xrange(5) for j in xrange(i+1, 5)]
                #print [[idx[0], idx[1]] for idx in ij]
                dice_bar = np.mean([float(np.sum(beta_signed[idx[0], :] == beta_signed[idx[1], :])) /\
                     (np.sum(beta_n0[idx[0], :]) + np.sum(beta_n0[idx[1], :]))
                     for idx in ij])
            except:
                dice_bar = fleiss_kappa_stat = 0.
            #
            r2_perms[perm_i] = r2
            corr_perms[perm_i] = corr
            r_bar_perms[perm_i] = r_bar
            fleiss_kappa_stat_perms[perm_i] = fleiss_kappa_stat
            dice_bar_perms[perm_i] = dice_bar
        # END PERMS
        print "save", key
        np.savez_compressed(OUTPUT+"/perms_"+key+".npz",
                            r2=r2_perms, corr=corr_perms,
                            r_bar=r_bar_perms, fleiss_kappa=fleiss_kappa_stat_perms,
                            dice_bar=dice_bar_perms)
        #
        perms = dict()
        fig, axis = plt.subplots(len(keys), 4)#, sharex='col')
        for i, key in enumerate(keys):
            perms[key] = np.load(OUTPUT+"/perms_"+key+".npz")
            n, bins, patches = axis[i, 0].hist(perms[key]['r2'], 50, normed=1, histtype='stepfilled')
            axis[i, 0].set_title(key + "_r2")
            n, bins, patches = axis[i, 1].hist(perms[key]['r_bar'], 50, normed=1, histtype='stepfilled')
            axis[i, 1].set_title(key + "_r_bar")
            n, bins, patches = axis[i, 2].hist(perms[key]['fleiss_kappa'], 50, histtype='stepfilled')
            axis[i, 2].set_title(key + "_fleiss_kappa")
            n, bins, patches = axis[i, 3].hist(perms[key]['dice_bar'], 50)#, 50, normed=1, histtype='stepfilled')
            axis[i, 3].set_title(key + "_dice_bar")
        plt.show()

        l1l2tv, l1tv, l1l2, l1 = ["0.001_0.3335_0.3335_0.333_-1",  "0.001_0.5_0_0.5_-1",  
                             "0.001_0.5_0.5_0_-1",  "0.001_1_0_0_-1"]

        # Read true scores
        import pandas as pd
        true = pd.read_csv(os.path.join(BASE, "..", "ADAS11-MCIc-CTL.csv"))
        true = true[true.a == 0.001]
        true_l1l2tv = true[true.l1 == 0.3335].iloc[0]
        true_l1l2 = true[(true.l1 == 0.5) & (true.l2 == 0.5)].iloc[0]
        true_l1tv = true[(true.l1 == 0.5) & (true.tv == 0.5)].iloc[0]
        true_l1 = true[(true.l1 == 1.)].iloc[0]

        # pvals
        nperms = float(len(perms[l1]['r2']))
        from collections import OrderedDict
        pvals = OrderedDict()
        pvals["cond"] = ['l1', 'l1tv', 'l1l2', 'l1l2tv'] * 4 + \
                ['l1 vs l1tv'] * 4  + ['l1l2 vs l1l2tv'] * 4
        pvals["stat"] = ['r2'] * 4 + ['r_bar'] * 4 + ['fleiss_kappa'] * 4 + ['dice_bar'] * 4 +\
                ['r2', 'r_bar', 'fleiss_kappa', 'dice_bar'] * 2
        pvals["pval"] = [
            np.sum(perms[l1]['r2'] > true_l1["r2"]),
            np.sum(perms[l1tv]['r2'] > true_l1tv["r2"]),
            np.sum(perms[l1l2]['r2'] > true_l1l2["r2"]),
            np.sum(perms[l1l2tv]['r2'] > true_l1l2tv["r2"]),
    
            np.sum(perms[l1]['r_bar'] > true_l1["beta_r_bar"]),
            np.sum(perms[l1tv]['r_bar'] > true_l1tv["beta_r_bar"]),
            np.sum(perms[l1l2]['r_bar'] > true_l1l2["beta_r_bar"]),
            np.sum(perms[l1l2tv]['r_bar'] > true_l1l2tv["beta_r_bar"]),
    
            np.sum(perms[l1]['fleiss_kappa'] > true_l1["beta_fleiss_kappa"]),
            np.sum(perms[l1tv]['fleiss_kappa'] > true_l1tv["beta_fleiss_kappa"]),
            np.sum(perms[l1l2]['fleiss_kappa'] > true_l1l2["beta_fleiss_kappa"]),
            np.sum(perms[l1l2tv]['fleiss_kappa'] > true_l1l2tv["beta_fleiss_kappa"]),
    
            np.sum(perms[l1]['dice_bar'] > true_l1["beta_dice_bar"]),
            np.sum(perms[l1tv]['dice_bar'] > true_l1tv["beta_dice_bar"]),
            np.sum(perms[l1l2]['dice_bar'] > true_l1l2["beta_dice_bar"]),
            np.sum(perms[l1l2tv]['dice_bar'] > true_l1l2tv["beta_dice_bar"]),
    
            # l1 vs l1tv
            np.sum((perms[l1tv]['r2'] - perms[l1]['r2']) > (true_l1tv["r2"] - true_l1["r2"])),
            np.sum((perms[l1tv]['r_bar'] - perms[l1]['r_bar']) > (true_l1tv["beta_r_bar"] - true_l1["beta_r_bar"])),
            np.sum((perms[l1tv]['fleiss_kappa'] - perms[l1]['fleiss_kappa']) > (true_l1tv["beta_fleiss_kappa"] - true_l1["beta_fleiss_kappa"])),
            np.sum((perms[l1tv]['dice_bar'] - perms[l1]['dice_bar']) > (true_l1tv["beta_dice_bar"] - true_l1["beta_dice_bar"])),
    
            # l1l2 vs l1l2tv
            np.sum((perms[l1l2]['r2'] - perms[l1l2tv]['r2']) > (true_l1l2["r2"] - true_l1l2tv["r2"])),
            np.sum((perms[l1l2tv]['r_bar'] - perms[l1l2]['r_bar']) > (true_l1l2tv["beta_r_bar"] - true_l1l2["beta_r_bar"])),
            np.sum((perms[l1l2tv]['fleiss_kappa'] - perms[l1l2]['fleiss_kappa']) > (true_l1l2tv["beta_fleiss_kappa"] - true_l1l2["beta_fleiss_kappa"])),
            np.sum((perms[l1l2tv]['dice_bar'] - perms[l1l2]['dice_bar']) > (true_l1l2tv["beta_dice_bar"] - true_l1l2["beta_dice_bar"]))]

        pvals = pd.DataFrame(pvals)
        pvals["pval"] /= nperms
        pvals.to_csv(os.path.join(OUTPUT, "pvals_stats_permutations.csv"), index=False)
コード例 #30
0
ファイル: eval_plots.py プロジェクト: Xiuying/illumitag
class FractionTaxaBarStack(Graph):
    """This is figure 3 of the paper"""

    short_name = 'fraction_taxa_barstack'
    bottom = 0.4
    top = 0.95
    left = 0.1
    right = 0.95
    formats = ('pdf', 'eps')

    def plot(self):
        # Make Frame #
        self.frame = OrderedDict((('%s - %s' % (p,f), getattr(p.fractions, f).rdp.phyla)
                     for f in ('low', 'med', 'big') for p in self.parent.pools))
        self.frame = pandas.DataFrame(self.frame)
        self.frame = self.frame.fillna(0)
        # Rename #
        new_names = {
            u"run001-pool01 - low": "2-step PCR low",
            u"run001-pool02 - low": "2-step PCR low",
            u"run001-pool03 - low": "2-step PCR low",
            u"run001-pool04 - low": "1-step PCR low",
            u"run002-pool01 - low": "New chem low",
            u"run001-pool01 - med": "2-step PCR med",
            u"run001-pool02 - med": "2-step PCR med",
            u"run001-pool03 - med": "2-step PCR med",
            u"run001-pool04 - med": "1-step PCR med",
            u"run002-pool01 - med": "New chem med",
            u"run001-pool01 - big": "2-step PCR high",
            u"run001-pool02 - big": "2-step PCR high",
            u"run001-pool03 - big": "2-step PCR high",
            u"run001-pool04 - big": "1-step PCR high",
            u"run002-pool01 - big": "New chem high",
        }
        self.frame.rename(columns=new_names, inplace=True)
        self.frame = self.frame.transpose()
        # Group low-abundance taxa into 'Others' #
        low_abundance = self.frame.sum() < 30000
        other_count = self.frame.loc[:, low_abundance].sum(axis=1)
        self.frame = self.frame.loc[:, ~low_abundance]
        self.frame['Others'] = other_count
        # Normalize #
        self.frame = self.frame.apply(lambda x: 100*x/x.sum(), axis=1)
        # Sort the table by sum #
        sums = self.frame.sum()
        sums.sort(ascending=False)
        self.frame = self.frame.reindex_axis(sums.keys(), axis=1)
        # Plot #
        fig = pyplot.figure()
        axes = self.frame.plot(kind='bar', stacked=True, color=cool_colors)
        fig = pyplot.gcf()
        # Other #
        axes.set_ylabel('Relative abundances in percent')
        axes.xaxis.grid(False)
        axes.yaxis.grid(False)
        axes.set_ylim([0,100])
        # Put a legend below current axis
        axes.legend(loc='upper center', bbox_to_anchor=(0.5, -0.40), fancybox=True, shadow=True, ncol=5, prop={'size':10})
        # Font size #
        axes.tick_params(axis='x', which='major', labelsize=11)
        # Save it #
        self.save_plot(fig, axes)
        self.frame.to_csv(self.csv_path)
        pyplot.close(fig)
コード例 #31
0
                      engine='c',
                      low_memory=True)

for i in range(len(az_data["MSN"])):
    if az_data["Year"][i] > 2009 and az_data["MSN"][i] == "TETCB":
        az_new_msn.append(az_data["MSN"][i])
        az_new_year.append(az_data["Year"][i])
        az_new_data.append(az_data["Data"][i])
    else:
        pass
az_new["MSN"] = az_new_msn
az_new["Year"] = az_new_year
az_new["Data"] = az_new_data
az_new = pd.DataFrame(az_new)
az_new.to_csv("data/csv/state_data/az_new_data.csv",
              index=False,
              index_label=False,
              sep=',')

for i in range(len(ca_data["MSN"])):
    if ca_data["Year"][i] > 2009 and ca_data["MSN"][i] == "TETCB":
        ca_new_msn.append(ca_data["MSN"][i])
        ca_new_year.append(ca_data["Year"][i])
        ca_new_data.append(ca_data["Data"][i])
    else:
        pass

ca_new["MSN"] = ca_new_msn
ca_new["Year"] = ca_new_year
ca_new["Data"] = ca_new_data
ca_new = pd.DataFrame(ca_new)
ca_new.to_csv("data/csv/state_data/ca_new_data.csv",
コード例 #32
0
                x_df.at[cur_index, 'mean disk space'] = sum(mean_disk_space)
            else:
                x_df.at[cur_index, 'max disk space'] = 0.0
                x_df.at[cur_index, 'mean disk space'] = 0.0
            mean_disk_space = [usage_df['mean disk space'][
                i]] if usage_df['mean disk space'][i] > 0 else []
            cur_start = int(usage_df['start time'][i] / interval)
            cur_index = cur_index + 1
        else:
            mean_disk_space = [usage_df['mean disk space'][
                i]] if usage_df['mean disk space'][i] > 0 else []
            cur_start = int(usage_df['start time'][i] / interval)
            cur_index = cur_index + 1

    x_df.to_csv(out_file, index=False)


# Get X Labels (Features)

import pandas as pd
IDs = pd.read_csv('./machine_id.csv')
machine_IDs = IDs['machine ID'].tolist()

output_file = './machine_label_X-500.csv'
start_pos = [0 for i in range(500)]

columns = ['timestamp', 'machine ID', 'max CPU usage', 'mean CPU usage', 'max disk I/O', 'mean disk I/O', 'max disk space',
           'mean disk space', 'max memory usage', 'mean memory usage', 'max page cache', 'mean page cache', 'max MAI', 'mean MAI']
out_df = pd.DataFrame(columns=columns)
コード例 #33
0
def main():

    opts = argparse.ArgumentParser()

    opts.add_argument('--excelSheetIn', dest='excelSheetIn')

    opts.add_argument('--barcodeFasta', default=None, dest='barcodeFasta')

    opts.add_argument('--fxnTransformBarcode_i7',
                      default="lambda barcin:barcin",
                      dest='fxnTransformBarcode_i7')
    opts.add_argument('--fxnTransformBarcode_i5',
                      default="lambda barcin:barcin",
                      dest='fxnTransformBarcode_i5')

    opts.add_argument('--coreTemplateIn', dest='coreTemplateIn')
    opts.add_argument('--coreSheetOut', default=None, dest='coreSheetOut')
    opts.add_argument('--coreBuffer', default=None, dest='coreBuffer')
    opts.add_argument('--coreConc', default=None, dest='coreConc')
    opts.add_argument('--coreVol', default=None, dest='coreVol')
    opts.add_argument('--coreFraglen', default=None, dest='coreFraglen')
    opts.add_argument('--coreShortcode', default=None, dest='coreShortcode')
    opts.add_argument('--coreSpecies', default=None, dest='coreSpecies')
    opts.add_argument('--coreNotes', default=None, dest='coreNotes')

    opts.add_argument('--libKeyOut', dest='libKeyOut')

    opts.add_argument('--extraCols', default=None, dest='extraCols')

    opts.add_argument(
        '--fxnLibName',
        default='lambda r:"%s_%s"%(r.source_plate,r.source_well)',
        dest='fxnLibName')

    o = opts.parse_args()

    fxnLibName = eval(o.fxnLibName)

    fxnTransformBarcode_i5 = eval(o.fxnTransformBarcode_i5)
    fxnTransformBarcode_i7 = eval(o.fxnTransformBarcode_i7)

    # load the barcode sequences
    filInBarcs = open(o.barcodeFasta, 'r')
    l = filInBarcs.readline()
    # if l[0]=='>':
    assert l[0] == '>'
    mBcNameSeq_i5i7 = {}
    while len(l) > 0:
        bcname = l[1:].rstrip()
        bcseq = filInBarcs.readline().rstrip()
        l = filInBarcs.readline()
        assert bcname not in mBcNameSeq_i5i7, '%s present > once' % bcname
        mBcNameSeq_i5i7[bcname] = (fxnTransformBarcode_i5(bcseq),
                                   fxnTransformBarcode_i7(bcseq))

    wbin = openpyxl.load_workbook(filename=o.excelSheetIn)

    sheet_src_plate = wbin.get_sheet_by_name('SOURCE PLATE')
    sheet_src_well = wbin.get_sheet_by_name('SOURCE WELL')
    sheet_src_barcp5 = wbin.get_sheet_by_name('P5 BARCODE')
    sheet_src_barcp7 = wbin.get_sheet_by_name('P7 BARCODE')

    # source plate:
    # make sure 1...12 from A6 to right
    for i in range(1, 12):
        sheetloc = ofsFrom('A6', right=i)
        obs = str(int(sheet_src_plate[sheetloc].value))
        exp = str(i)
        assert obs == exp, 'SOURCE PLATE %s : expected %s but got %s' % (
            sheetloc, exp, obs)
    # make sure A..H going from A7 down
    for i in range(0, 8):
        sheetloc = ofsFrom('A7', down=i)
        obs = str(sheet_src_plate[sheetloc].value)
        exp = str(chr(ord('A') + i))
        assert obs == exp, 'SOURCE PLATE %s : expected %s but got %s' % (
            sheetloc, exp, obs)

    # well plate:
    # make sure 1...12 from A6 to right
    for i in range(1, 12):
        sheetloc = ofsFrom('A6', right=i)
        obs = str(int(sheet_src_well[sheetloc].value))
        exp = str(i)
        assert obs == exp, 'SOURCE WELL %s : expected %s but got %s' % (
            sheetloc, exp, obs)
    # make sure A..H going from A7 down
    for i in range(0, 8):
        sheetloc = ofsFrom('A7', down=i)
        obs = str(sheet_src_well[sheetloc].value)
        exp = str(chr(ord('A') + i))
        assert obs == exp, 'SOURCE WELL %s : expected %s but got %s' % (
            sheetloc, exp, obs)

    # p7 barc:
    # make sure 1...12 from A7 to right
    for i in range(1, 12):
        sheetloc = ofsFrom('A7', right=i)
        obs = str(int(sheet_src_barcp7[sheetloc].value))
        exp = str(i)
        assert obs == exp, 'P7 PRIMER %s : expected %s but got %s' % (sheetloc,
                                                                      exp, obs)
    # make sure A..H going from A8 down
    for i in range(0, 8):
        sheetloc = ofsFrom('A8', down=i)
        obs = str(sheet_src_barcp7[sheetloc].value)
        exp = str(chr(ord('A') + i))
        assert obs == exp, 'P7 PRIMER %s : expected %s but got %s' % (sheetloc,
                                                                      exp, obs)

    # p5 barc:
    # make sure 1...12 from A7 to right
    for i in range(1, 12):
        sheetloc = ofsFrom('A7', right=i)
        obs = str(int(sheet_src_barcp5[sheetloc].value))
        exp = str(i)
        assert obs == exp, 'p5 PRIMER %s : expected %s but got %s' % (sheetloc,
                                                                      exp, obs)
    # make sure A..H going from A8 down
    for i in range(0, 8):
        sheetloc = ofsFrom('A8', down=i)
        obs = str(sheet_src_barcp5[sheetloc].value)
        exp = str(chr(ord('A') + i))
        assert obs == exp, 'p5 PRIMER %s : expected %s but got %s' % (sheetloc,
                                                                      exp, obs)

    # gather into DF
    df = OrderedDict()
    for col in [
            'well', 'source_plate', 'source_well', 'p7_barc_and_well',
            'p5_barc_and_well', 'p7_barc', 'p5_barc', 'p7_barc_seq',
            'p5_barc_seq'
    ]:
        df[col] = []

    for j in range(0, 12):
        for i in range(0, 8):
            sheetloc1 = ofsFrom('B7', down=i, right=j)
            sheetloc2 = ofsFrom('B8', down=i, right=j)
            # fun fact- excel does rows/cols opposite of PCR plates
            well = ofsFrom('A1', down=j, right=i)
            srcplate = sheet_src_plate[sheetloc1].value
            srcwell = sheet_src_well[sheetloc1].value
            curp5 = sheet_src_barcp5[sheetloc2].value
            curp7 = sheet_src_barcp7[sheetloc2].value

            srcplate = '' if srcplate is None else str(srcplate).strip()
            srcwell = '' if srcwell is None else str(srcwell).strip()
            curp5 = '' if curp5 is None else str(curp5).strip()
            curp7 = '' if curp7 is None else str(curp7).strip()

            # srcplate=srcplate.replace('-','').replace('_','')
            srcplate = re.subn('[\'"$:\W@\n]', '', srcplate)[0]
            srcwell = re.subn('[\'"$:\W@\n]', '', srcwell)[0]

            if any([
                    len(srcplate) == 0,
                    len(srcwell) == 0,
                    len(curp5) == 0,
                    len(curp7) == 0
            ]):
                if not all([
                        len(srcplate) == 0,
                        len(srcwell) == 0,
                        len(curp5) == 0,
                        len(curp7) == 0
                ]):
                    print(
                        'WARNING: well %s is not empty in all sheets: %s %s %s %s'
                        % (well, srcplate, srcwell, curp5, curp7))
            else:

                assert ':' in curp7, 'ERROR well %s p7 invalid barcode %s' % (
                    well, curp7)
                assert ':' in curp5, 'ERROR well %s p5 invalid barcode %s' % (
                    well, curp5)

                df['well'].append(well)
                df['source_plate'].append(srcplate)
                df['source_well'].append(srcwell)
                df['p7_barc_and_well'].append(curp7)
                df['p5_barc_and_well'].append(curp5)
                df['p7_barc'].append(curp7.split(':')[1])
                df['p5_barc'].append(curp5.split(':')[1])
                df['p7_barc_seq'].append(
                    mBcNameSeq_i5i7[curp7.split(':')[1]][1])
                df['p5_barc_seq'].append(
                    mBcNameSeq_i5i7[curp5.split(':')[1]][0])

    # gather extra cols if any
    mKvExtra = {}
    if o.extraCols is not None:
        for kv in o.extraCols.split(','):
            mKvExtra[kv.split(':')[0]] = kv.split(':')[1]

    df = pd.DataFrame(df)
    for k in mKvExtra:
        df[k] = mKvExtra[k]

    df['libname'] = ''
    for i in df.index:
        df.loc[i, 'libname'] = fxnLibName(df.loc[i])

    # save to our own key
    df.to_csv(o.libKeyOut, sep='\t', index=False)

    # save into core template
    if o.coreSheetOut is not None:
        shutil.copyfile(o.coreTemplateIn, o.coreSheetOut)
        ct = openpyxl.load_workbook(filename=o.coreSheetOut)

        wb = ct.active

        # wb['A13']='Sample Name*'

        rowofs = 0
        for _, r in df.iterrows():
            shloco = ofsFrom('A18', down=rowofs, right=0)
            wb[shloco] = r.libname

            # shloco=ofsFrom('A18',down=rowofs,right=1)
            # wb[ shloco ] = o.coreBuffer

            shloco = ofsFrom('A18', down=rowofs, right=2)
            wb[shloco] = float(o.coreConc)

            shloco = ofsFrom('A18', down=rowofs, right=3)
            wb[shloco] = float(o.coreVol)

            # shloco=ofsFrom('A18',down=rowofs,right=4)
            # wb[ shloco ] = float(o.coreFraglen)

            shloco = ofsFrom('A18', down=rowofs, right=5)
            bcseq5 = mBcNameSeq_i5i7[r.p5_barc][0]
            bcseq7 = mBcNameSeq_i5i7[r.p7_barc][1]
            wb[shloco] = bcseq7

            shloco = ofsFrom('A18', down=rowofs, right=4)
            wb[shloco] = bcseq5

            # shloco=ofsFrom('A18',down=rowofs,right=6)
            # wb[ shloco ] = o.coreShortcode

            shloco = ofsFrom('A18', down=rowofs, right=5)
            wb[shloco] = o.coreSpecies

            shloco = ofsFrom('A18', down=rowofs, right=6)
            wb[shloco] = "DNA"

            if rowofs == 0:
                shloco = ofsFrom('A18', down=rowofs, right=7)
                wb[shloco] = o.coreNotes

            rowofs += 1

        ct.save(filename=o.coreSheetOut)
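
The sheet-checking and writing code relies on an ofsFrom helper that is not part of the snippet. A plausible stand-alone implementation of such an A1-style offset helper (an assumption about its behaviour, not the project's own code) could be:

import re

def ofsFrom(cell, down=0, right=0):
    # Offset an A1-style cell reference, e.g. ofsFrom('A6', right=2) -> 'C6'.
    match = re.match(r'([A-Z]+)(\d+)$', cell)
    col_letters, row = match.group(1), int(match.group(2))
    col = 0
    for ch in col_letters:
        col = col * 26 + (ord(ch) - ord('A') + 1)
    col += right
    letters = ''
    while col > 0:
        col, rem = divmod(col - 1, 26)
        letters = chr(ord('A') + rem) + letters
    return '{}{}'.format(letters, row + down)

assert ofsFrom('A6', right=2) == 'C6'
assert ofsFrom('A18', down=3, right=5) == 'F21'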
コード例 #34
0
problem = {
        "num_vars" : len(par_morris),
        "names" : par_morris,
        "groups" : None,
        "bounds" : [[0, lev-1]] * len(par_morris)
        }

param_values = sample(problem, N=n_traj, grid_jump=grid_jump, num_levels=lev,
                      sample4uniformity = 1000).astype(np.int64)

#%%Plot
#fig = plt.figure(figsize=(8, 6))
#sample_histograms(fig, param_values, problem, {'color': 'y'})
#plt.tight_layout()
#plt.savefig(fig_out_path, dpi=100)

#%%Create Dataframes
traj_real = pd.DataFrame(OrderedDict([(par, pars[par][param_values[:, i]]) for i, par in enumerate(par_morris)]))
traj_id  = pd.DataFrame(OrderedDict([(par, param_values[:, i]) for i, par in enumerate(par_morris)]))
fixed_pars = pd.DataFrame(fixed_pars,  index=["fix"])

#Generate a 2D array with, for each simulation, all candidate levels
n_aqtds_all = (np.linspace(0,1,num=lev)*aqtds_depth(traj_real["H_b"])[:, None])
n_aqtd_select = n_aqtds_all[np.arange(traj_id["N_aqt"].shape[0]), traj_id["N_aqt"].values].astype(np.int64)
traj_real["N_aqt"] = n_aqtd_select

#%%Save as csv
traj_real.to_csv(traj_real_path)
traj_id.to_csv(traj_id_path)
fixed_pars.to_csv(fixed_pars_path)
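
The N_aqt selection near the end uses NumPy fancy indexing to pick, for every trajectory, the depth level matching its sampled level index (arr[np.arange(n), idx] takes element idx[i] from row i). A toy illustration of that indexing pattern:

import numpy as np

levels_per_traj = np.array([[0, 10, 20],
                            [0, 15, 30],
                            [0, 20, 40],
                            [0, 25, 50]])          # 4 trajectories, 3 levels each
chosen_level = np.array([2, 0, 1, 2])              # sampled level index per trajectory

# Row i contributes its element at column chosen_level[i].
selected = levels_per_traj[np.arange(levels_per_traj.shape[0]), chosen_level]
print(selected)  # [20  0 20 50]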