Example #1
def average_series(series_obj: pd.Series) -> pd.Series:
    """
    For the object, inspect its keys for
    min/max entries and add them to a set.
    For each item in that set, average the
    min/max entries and return a dataframe
    row with this information.
    """
    labels = series_obj.keys()
    data = []
    axes = []
    values = {}
    for key in labels:
        m = re.search("((?<=min)|(?<=spc)|(?<=max))[A-Za-z]*", key)
        if m and m.group(0):
            if m.group(0) in values and len(values[m.group(0)]) == 2:
                total = (sum(values[m.group(0)]) + series_obj.get(key)) / 3
                if isinstance(values[m.group(0)][0], np.int64):
                    total = int(total)
                data.append(total)
                axes.append(m.group(0))
                del values[m.group(0)]
            elif m.group(0) in values:
                values[m.group(0)].append(series_obj.get(key))
            else:
                values[m.group(0)] = [series_obj.get(key)]
        else:
            data.append(series_obj.get(key))
            axes.append(key)
    return pd.Series(data, axes)
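A minimal sketch of calling this helper (the key names are invented; it assumes the imports the function relies on, re, numpy as np and pandas as pd, are in scope):

import re
import numpy as np
import pandas as pd

row = pd.Series({"minTemp": 10, "spcTemp": 20, "maxTemp": 30, "city": "Oslo"})
print(average_series(row))  # index ['Temp', 'city'], values [20.0, 'Oslo']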
Example #2
def export_heat_file_combined(site_type, score_by='p', mode='max'):

    analyses = [
        active_driver.pan_cancer_analysis, active_driver.clinvar_analysis
    ]
    scores = []

    for analysis in analyses:
        result = analysis(site_type)
        df = result['all_gene_based_fdr']
        df[score_by] = -np.log10(df[score_by])
        df = df.set_index('gene')[score_by]
        scores.append(df)

    cancer = scores[0]
    clinvar = scores[1]

    if mode == 'product':
        combined_scores = cancer * clinvar
        combined_scores = combined_scores.fillna(0)
    elif mode == 'max':
        all_genes = set(list(cancer.index) + list(clinvar.index))
        cancer = Series({gene: cancer.get(gene, 0) for gene in all_genes})
        clinvar = Series({gene: clinvar.get(gene, 0) for gene in all_genes})
        combined_scores = cancer.where(cancer > clinvar,
                                       clinvar).fillna(cancer)
    else:
        raise ValueError(f'Wrong mode: {mode}')

    combined_scores.to_csv(f'hotnet_input_heat_combined_{mode}_{score_by}',
                           sep='\t',
                           header=False)
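The 'max' branch reduces to an element-wise maximum over the union of gene indices; a standalone sketch with invented scores:

import pandas as pd

cancer = pd.Series({'TP53': 3.0, 'EGFR': 1.0})
clinvar = pd.Series({'TP53': 2.0, 'BRCA1': 4.0})
all_genes = set(cancer.index) | set(clinvar.index)
cancer = pd.Series({gene: cancer.get(gene, 0) for gene in all_genes})
clinvar = pd.Series({gene: clinvar.get(gene, 0) for gene in all_genes})
print(cancer.where(cancer > clinvar, clinvar))  # element-wise max per gene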
Example #3
def test_get():
    # GH 6383
    s = Series(np.array([43, 48, 60, 48, 50, 51, 50, 45, 57, 48, 56, 45,
                         51, 39, 55, 43, 54, 52, 51, 54]))

    result = s.get(25, 0)
    expected = 0
    assert result == expected

    s = Series(np.array([43, 48, 60, 48, 50, 51, 50, 45, 57, 48, 56,
                         45, 51, 39, 55, 43, 54, 52, 51, 54]),
               index=pd.Float64Index(
                   [25.0, 36.0, 49.0, 64.0, 81.0, 100.0,
                    121.0, 144.0, 169.0, 196.0, 1225.0,
                    1296.0, 1369.0, 1444.0, 1521.0, 1600.0,
                    1681.0, 1764.0, 1849.0, 1936.0],
                   dtype='object'))

    result = s.get(25, 0)
    expected = 43
    assert result == expected

    # GH 7407
    # with a boolean accessor
    df = pd.DataFrame({'i': [0] * 3, 'b': [False] * 3})
    vc = df.i.value_counts()
    result = vc.get(99, default='Missing')
    assert result == 'Missing'

    vc = df.b.value_counts()
    result = vc.get(False, default='Missing')
    assert result == 3

    result = vc.get(True, default='Missing')
    assert result == 'Missing'
Example #4
    def _create_image_from_row(row: pd.Series) -> Image:
        r"""

        Parameters
        ----------
        row

        Returns
        -------

        """
        path = row[DatasetKeys.PATH.value]
        labels = row.get(DatasetKeys.LABELS.value)
        boxes = row.get(DatasetKeys.BOXES.value)
        boxes_labels = row.get(DatasetKeys.BOXES_LABELS.value)

        if boxes is not None:
            box_list = [
                Box(ijhw=ijhw, labels=labels)
                for ijhw, labels in zip(boxes, boxes_labels)
                if None not in ijhw and all([c > 0 for c in ijhw[:2]])
            ]
        else:
            box_list = None

        return Image.from_path(path=path,
                               labels=labels,
                               box_collection=box_list)
Example #5
def test_get():
    # GH 6383
    s = Series(np.array([43, 48, 60, 48, 50, 51, 50, 45, 57, 48, 56, 45,
                         51, 39, 55, 43, 54, 52, 51, 54]))

    result = s.get(25, 0)
    expected = 0
    assert result == expected

    s = Series(np.array([43, 48, 60, 48, 50, 51, 50, 45, 57, 48, 56,
                         45, 51, 39, 55, 43, 54, 52, 51, 54]),
               index=pd.Float64Index(
                   [25.0, 36.0, 49.0, 64.0, 81.0, 100.0,
                    121.0, 144.0, 169.0, 196.0, 1225.0,
                    1296.0, 1369.0, 1444.0, 1521.0, 1600.0,
                    1681.0, 1764.0, 1849.0, 1936.0],
                   dtype='object'))

    result = s.get(25, 0)
    expected = 43
    assert result == expected

    # GH 7407
    # with a boolean accessor
    df = pd.DataFrame({'i': [0] * 3, 'b': [False] * 3})
    vc = df.i.value_counts()
    result = vc.get(99, default='Missing')
    assert result == 'Missing'

    vc = df.b.value_counts()
    result = vc.get(False, default='Missing')
    assert result == 3

    result = vc.get(True, default='Missing')
    assert result == 'Missing'
Example #6
def drift(x: Series, h: int) -> Series:
    # x : time series data
    # h : number of future predictions
    # equation : Ŷ(t+h|t) = Y_t + h * (Y_t - Y_1) / (t - 1)
    diffRate = (x.get(x.last_valid_index()) - x.get(x.first_valid_index())) / (len(x.values) - 1)
    result = []
    for t in range(h):
        result.append(x.get(x.last_valid_index()) + ((t + 1) * diffRate))
    return Series(np.array(result))
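A quick sanity check of the drift forecast (assuming pandas and numpy are imported as in the snippet):

import numpy as np
from pandas import Series

s = Series([10.0, 12.0, 14.0])
print(drift(s, 2).values)  # [16. 18.]  (slope (14 - 10) / 2 = 2 per step)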
Example #7
def _check_and_review_practice_from_git(row: Series) -> int:
    get_file = partial(fn_get_file_info, row.get("repo_site"),
                       row.get("repo_user"), row.get("repo_name"))
    get_commit_list = partial(fn_get_commit_list_of_a_file,
                              row.get("repo_site"), row.get("repo_user"),
                              row.get("repo_name"))
    get_practice_list = partial(get_practice_file_from_git, get_file,
                                get_commit_list, get_file_content)
    return _check_and_review_practice(get_practice_list, row, practice)
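The pattern here is plain functools.partial currying over repo coordinates pulled from the row; a self-contained sketch with a stand-in fetch function (fn_get_file_info and friends are project code):

from functools import partial
import pandas as pd

def fetch(site, user, name, path):
    return f"{site}/{user}/{name}/{path}"

row = pd.Series({"repo_site": "github.com", "repo_user": "octo", "repo_name": "demo"})
get_file = partial(fetch, row.get("repo_site"), row.get("repo_user"), row.get("repo_name"))
print(get_file("README.md"))  # github.com/octo/demo/README.md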
Example #8
def validate(row: pd.Series):
    """
    Best-effort validation checks on an input data row.
    """
    official_page = row["official_page"]
    if official_page and not checkers.is_url(official_page):
        raise ValidationWarning("Invalid official page URL (official_page)")
    detail_page = row["detail_page"]
    if detail_page and not checkers.is_url(detail_page):
        raise ValidationWarning("Invalid detail page URL (detail_page)")

    # Format validation for zip codes and phone numbers (not strict)
    remove_char_regex = r"[ -‐-‑ー−‒–—―ー ]"  # rough set of separator characters
    tel = re.sub(remove_char_regex, "", row["tel"])
    if tel and not re.match(r"^0\d{9,10}$", tel):
        raise ValidationWarning("Invalid phone number format (tel)")  # 9-10 half-width digits starting with 0
    zip_code = re.sub(remove_char_regex, "", row["zip_code"])
    if zip_code and not re.match(r"\d{7}$", zip_code):
        raise ValidationWarning("Invalid zip code format (zip_code)")  # 7 half-width digits

    # Fields that should not contain HTML tags
    for target in [
            "shop_name",
            "address",
            "official_page",
            "detail_page",
            "opening_hours",
            "closing_day",
            "area_name",
    ]:
        text = row.get(target)
        if not text:
            continue
        if len(text) != len(w3lib.html.remove_tags(text)):
            raise ValidationWarning(f"{target} contains HTML tags")

    # Sanity check of the zip-code-based geocoding result
    try:
        zip_code = row["zip_code"]
        if not zip_code:
            return
        pref = cached_posuto_pref(zip_code)
    except KeyError:
        # MEMO: some (special) zip codes are missing from posuto's data.
        # These are so-called large-office individual numbers; nothing can be
        # done about them, so treat validation as passed.
        logger.info(f"Unknown zip code (a large-office individual number, perhaps?): zip code={zip_code}")
        return
    except Exception as e:
        # MEMO: posuto failed internally on some other exotic zip code
        logger.warning(e, stack_info=True)
        logger.warning(f"unknown posuto error, zip code={zip_code}")
        raise ValidationWarning("Zip code triggers an internal posuto error")

    norm_addr = row.get("normalized_address")
    if norm_addr and not norm_addr.startswith(pref):
        raise ValidationWarning(
            f"Prefecture derived from the zip code is {pref}, "
            f"but the geocoded address is {norm_addr}")
Example #9
def getObjSense(self, problemname: str, x: pd.Series):
    """get the objective sense of a problem
    """
    if problemname in self.objsensedict:
        return self.objsensedict[problemname]
    elif not pd.isnull(x.get(Key.ObjectiveSense, None)):
        return x.get(Key.ObjectiveSense)
    else:
        logger.warning("No objective sense for {}, assuming minimization".format(problemname))
        return ObjectiveSenseCode.MINIMIZE
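The `pd.isnull(x.get(key, None))` idiom treats both a missing key and a stored NaN as absent; a standalone illustration (column names invented):

import numpy as np
import pandas as pd

x = pd.Series({"ObjectiveSense": "min", "Gap": np.nan})
print(pd.isnull(x.get("ObjectiveSense", None)))  # False: present
print(pd.isnull(x.get("Gap", None)))             # True: stored as NaN
print(pd.isnull(x.get("Missing", None)))         # True: key absent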
Example #10
def get_key_split(series: pd.Series):
    '''
    Build the new column value in the form: prefix + storage class.
    '''
    key = series.get('Key', '')
    storage_class = series.get('StorageClass', '')
    prefix_key = '/'.join(key.split('/')[:-1])
    prefix_key = '/' if prefix_key == '' else f'/{prefix_key}/'
    result = f'{prefix_key}+{storage_class}'
    return result
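For example, with S3-style listing fields (made-up values):

import pandas as pd

row = pd.Series({'Key': 'logs/2021/app.log', 'StorageClass': 'STANDARD'})
print(get_key_split(row))  # '/logs/2021/+STANDARD'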
Example #11
def get_metadata(row: pd.Series, columns: dict, label: str):
    metadata = dict()

    if columns["metadata_column"]:
        for column in columns["metadata_column"]:
            if pd.notnull(row.get(column)):
                metadata[sanitize(column)] = row.get(column)

        logger.debug(f"Got metadata: '{metadata}' for '{label}'")

    return metadata
Example #12
def get_combined_text(row: pd.Series):
    categories_string = get_category_string(row)
    values = [
        row.get("title", ""),
        row.get("subtitle", ""),
        row.get("shortDescription", ""),
        # row.get("description", ""), Don't use this to save on compute load
        categories_string,
    ]
    result = " ".join(str(x) for x in values)
    return result
Example #13
def get_additional_column(row: pd.Series, columns: dict, label: str):
    additional_column = dict()

    if columns["additional_column"]:
        for column in columns["additional_column"]:
            if pd.notnull(row.get(column)):
                additional_column[sanitize(column)] = row.get(column)

        logger.debug(
            f"Got additional_column: '{additional_column}' for '{label}'")

    return additional_column
Example #14
def _gen_messages(
    row: Series,
    col_names: List[str],
    messages: List[Message],
    importance_coefficient: float = 1.0,
    polarity: float = 0.0,
) -> None:
    location = row["location"]
    location_type = row["location_type"]
    timestamp_type = row["timestamp_type"]
    agent = row["agent"]
    agent_type = row["agent_type"]
    timestamp = row["timestamp"]

    if isinstance(timestamp, float):
        timestamp = str(int(timestamp))

    if not timestamp.startswith("202"):
        return

    for col_name in col_names:
        value_type = col_name
        value = row[col_name]

        outlierness_col_name = col_name + ":outlierness"
        outlierness = row.get(outlierness_col_name, None)

        if not outlierness:
            outlierness = row.get(col_name + ":grouped_by_time:outlierness",
                                  None)

        if value is None or value == "" or (isinstance(value, float)
                                            and isnan(value)):
            # 'value' is effectively undefined; do not generate a message.
            continue

        fact = Fact(
            location="[ENTITY:{}:{}]".format(location_type, location),
            location_type=location_type,
            value=value,
            value_type=value_type,
            timestamp=timestamp,
            timestamp_type=timestamp_type,
            agent=agent,
            agent_type=agent_type,
            outlierness=outlierness,
        )

        message = Message(facts=fact,
                          importance_coefficient=importance_coefficient,
                          polarity=polarity)
        messages.append(message)
Example #15
def get_locations(row: pd.Series, columns: dict, label: str):
    location = None

    if columns["latitude_column"] and columns["longitude_column"]:
        latitude = row.get(columns["latitude_column"])
        longitude = row.get(columns["longitude_column"])

        if pd.notna(latitude) and pd.notna(longitude):
            location = (longitude, latitude)

        logger.debug(f"Got location '{location}' for '{label}'")

    return location
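A minimal run-through (column names invented; assumes a module-level logger as in the source project):

import logging
import pandas as pd

logger = logging.getLogger(__name__)

row = pd.Series({"lat": 60.17, "lon": 24.94})
columns = {"latitude_column": "lat", "longitude_column": "lon"}
print(get_locations(row, columns, "Helsinki"))  # (24.94, 60.17), i.e. (longitude, latitude)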
Example #16
def from_series(cls, series: pd.Series, name: str, sn: int = 1):
    print('Check that the data in the table is correct\n')
    try:
        return cls(qm_program=series.get('qm_program', 'gaussian'),
                   linear=series.get('linear', False),
                   atom=series.get('atom', False),
                   qm_data=series,
                   name=name,
                   sn=sn)
    except Exception as err:
        # TODO
        _msg = f'initialization error in {name}'
        # logger.error(_msg)
        raise err
Example #17
def mapToGenre(x: pd.Series):
    """
    :param x: pd.Series with "movieId" and "genres" fields
    :return: None; appends movieId to the module-level `genres` mapping
    """
    movieId = x.get("movieId")
    val = x.get("genres")
    if val == "(no genres listed)":
        genres['no-genres-listed'].append(movieId)
        return
    genre_list = [g.lower() for g in val.split("|")]
    for g in genre_list:
        genres[g].append(movieId)
Example #18
        def determine_status_level(html_row_series: pd.Series):
            """
            Evaluate presence of data in html table and return string based on business logic tree.

            This reproduces functionality from an interpretation of a single-line statement in the
            old code that determined the 'Status' value in the database table.
            OLD PYTHON STATEMENT:
                "red" if row[4] is not '' else "yellow" if row[3] is not '' or row[6] is not ''
                else "t_bypass" if row[7] is not '' else "mini" if row[5] is not '' else "normal"
            'row' was a record from an html table, with two values appended at the beginning. The old
            process basically looked for a value other than null/empty, with a hierarchy of importance
            when values are simultaneously present. The old way created a list called 'row' that started
            with created date and current date, then the row contents from the html table. The redesign
            subtracts two from the old index positions since the two date values are no longer a factor.

            :param html_row_series: pandas series containing data from a row of html table
            :return: status string
            """
            # Get the values in the table or a default of numpy NaN
            yellow_alert_ser_val = html_row_series.get(key="Yellow Alert",
                                                       default=np.NaN)
            red_alert_ser_val = html_row_series.get(key="Red Alert",
                                                    default=np.NaN)
            mini_disaster_ser_val = html_row_series.get(key="Mini Disaster",
                                                        default=np.NaN)
            reroute_ser_val = html_row_series.get(key="ReRoute",
                                                  default=np.NaN)
            trauma_bypass_ser_val = html_row_series.get(key="Trauma ByPass",
                                                        default=np.NaN)

            # Check for any non-null value in order of business importance level and return the result
            if pd.notnull(red_alert_ser_val):
                # Red alerts are top priority
                return "red"
            elif pd.notnull(yellow_alert_ser_val) or pd.notnull(reroute_ser_val):
                # Yellow or ReRoute take second priority
                return "yellow"
            elif pd.notnull(trauma_bypass_ser_val):
                # Trauma ByPass is third
                return "t_bypass"
            elif pd.notnull(mini_disaster_ser_val):
                # Mini Disaster is fourth
                return "mini"
            else:
                return "normal"
Example #19
def test_get_with_default():
    # GH#7725
    d0 = ["a", "b", "c", "d"]
    d1 = np.arange(4, dtype="int64")
    others = ["e", 10]

    for data, index in ((d0, d1), (d1, d0)):
        s = Series(data, index=index)
        for i, d in zip(index, data):
            assert s.get(i) == d
            assert s.get(i, d) == d
            assert s.get(i, "z") == d
            for other in others:
                assert s.get(other, "z") == "z"
                assert s.get(other, other) == other
Example #20
def _annualized_return(
        levels: pd.Series,
        rolling: Union[int, pd.DateOffset],
        interpolation_method: Interpolate = Interpolate.NAN) -> pd.Series:
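    # Each point annualizes growth from the window start to date d:
    # (value / starting value) ** (365.25 / days elapsed) - 1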
    if isinstance(rolling, pd.DateOffset):
        starting = [tstamp - rolling for tstamp in levels.index]
        levels = interpolate(levels, method=interpolation_method)
        points = list(
            map(
                lambda d, v, i: pow(v / levels.get(i, np.nan), 365.25 /
                                    (d - i).days) - 1, levels.index[1:],
                levels.values[1:], starting[1:]))
    else:
        if interpolation_method is not Interpolate.NAN:
            raise MqValueError(
                f'If w is not a relative date, method must be nan. You specified method: '
                f'{interpolation_method.value}.')
        starting = [0] * rolling
        starting.extend([a for a in range(1, len(levels) - rolling + 1)])
        points = list(
            map(
                lambda d, v, i: pow(v / levels[i], 365.25 /
                                    (d - levels.index[i]).days) - 1,
                levels.index[1:], levels.values[1:], starting[1:]))
    points.insert(0, 0)
    return pd.Series(points, index=levels.index)
Example #21
    def test_get_default(self):

        # GH 7725
        d0 = "a", "b", "c", "d"
        d1 = np.arange(4, dtype='int64')
        others = "e", 10

        for data, index in ((d0, d1), (d1, d0)):
            s = Series(data, index=index)
            for i, d in zip(index, data):
                assert s.get(i) == d
                assert s.get(i, d) == d
                assert s.get(i, "z") == d
                for other in others:
                    assert s.get(other, "z") == "z"
                    assert s.get(other, other) == other
Example #22
def collect_person_data(person: pd.Series, people: pd.DataFrame = None):
    if people is None:
        people = pd.DataFrame({},
                              columns=[
                                  'givenName',
                                  'middleName',
                                  'familyName',
                                  'addressLocality',
                                  'addressRegion',
                                  'checkRelatives',
                                  'none_relatives',
                              ])

    print(
        f'== {person.get("givenName", "___")} {person.get("familyName", "___")} =='
    )

    for collector in COLLECTORS:
        with collector(person) as c:
            c.validate_data()
            person = c.person.copy(deep=True)
            relatives = c.check_relatives(people)
            if relatives is False:
                continue
            people = people.append(relatives, ignore_index=True)

    if person.get('name', '') in people.index:
        people = people.drop(person.name).append(
            person, ignore_index=False).sort_index()
        # people.loc[person.name] = person

    return person, people
Example #23
def create_instance(
    cls: Type[DeclaredModel],
    row: pd.Series,
    *,
    null_values: List[str] = None,
    ignore_cols: List[str] = None,
) -> DeclaredModel:
    if null_values is None:
        null_values = []
    if ignore_cols is None:
        ignore_cols = []

    data = {}

    for col in cls.__table__.columns:
        if col.name in ignore_cols:
            continue

        val = row.get(col.name)
        assert val is not None
        if val in null_values:
            val = None
        else:
            val = get_converter(col.type)(val)
        data[col.name] = val
    return cls(**data)
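A hedged, self-contained sketch of feeding a row into a declarative model; the City model and the get_converter stub are invented here (the real get_converter is project code):

from typing import Any

import pandas as pd
from sqlalchemy import Column, Integer, String
from sqlalchemy.orm import declarative_base

Base = declarative_base()

class City(Base):
    __tablename__ = "city"
    id = Column(Integer, primary_key=True)
    name = Column(String)

def get_converter(col_type: Any):
    # Stub: map SQLAlchemy column types to plain Python casts
    return int if isinstance(col_type, Integer) else str

row = pd.Series({"id": "7", "name": "Oslo"})
city = create_instance(City, row)
print(city.id, city.name)  # 7 Oslo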
Example #24
def _prepare_one_phenotype(C: NDArray[(Any, Any), Float], row: pd.Series,
                           correction: str, includes_intercept: bool) -> pd.Series:
    '''
    Creates the broadcasted information for one (phenotype, offset) pair. The returned series
    contains the information eventually stored in a LogRegState.

    This function accepts and returns a pandas series for integration with Pandas UDFs and
    pd.DataFrame.apply.
    '''
    y = row['values']
    mask = ~np.isnan(y)
    offset = row.get('offset')
    y_pred = _logistic_null_model_predictions(y, C, mask, offset)
    y_res = np.nan_to_num(y - y_pred)
    gamma = y_pred * (1 - y_pred)
    CtGammaC = C.T @ (gamma[:, None] * C)
    inv_CtGammaC = np.linalg.inv(CtGammaC)
    row.label = str(row.label)  # Ensure that the phenotype name is a string
    row.drop(['values', 'offset'], inplace=True, errors='ignore')
    row['y_res'], row['gamma'], row['inv_CtGammaC'] = np.ravel(
        y_res), np.ravel(gamma), np.ravel(inv_CtGammaC)
    if correction == correction_approx_firth:
        row['firth_offset'] = np.ravel(
            af.perform_null_firth_fit(y, C, mask, offset, includes_intercept))
    return row
Example #25
def get_exact_mass(row: pd.Series) -> Optional[float]:
    for column in ['PUBCHEM_EXACT_MASS', 'EXACT_MASS', 'EXACT MASS']:
        molecular_mass = row.get(column)
        # Guard before converting: row.get returns None for a missing column,
        # and float(None) would raise a TypeError.
        if molecular_mass is not None:
            molecular_mass = float(molecular_mass)
            if not np.isnan(molecular_mass):
                return molecular_mass
    return None
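For instance, with only one of the candidate columns populated (made-up value):

import numpy as np
import pandas as pd

row = pd.Series({'EXACT_MASS': '180.0634'})
print(get_exact_mass(row))  # 180.0634: PUBCHEM_EXACT_MASS is absent, so the next column wins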
Example #26
def safe_update(cls, data_row: pd.Series, obj: Any):
    """
    for field_name, (column_name, column_type) in cls:
        try:
            setattr(obj, field_name, data_row[column_name])
        except BaseException:
            logger.error("Could not set '{}' from '{}' on '{}'".format(
                data_row.get(column_name, 'MISSING'),
                column_name,
                obj,
            ))
    """
    for i in cls:
        try:
            setattr(obj, i.name, data_row[i.value.column_name])
        except BaseException:
            logger.error("Could not set '{}' from '{}' on '{}'".format(
                data_row.get(i.value.column_name, 'MISSING'),
                i.value.column_name,
                obj,
            ))
    try:
        obj.save()
    except BaseException:
        logger.exception("Could not save '{}'".format(obj))
Example #27
    def test_get_default(self):

        # GH 7725
        d0 = "a", "b", "c", "d"
        d1 = np.arange(4, dtype='int64')
        others = "e", 10

        for data, index in ((d0, d1), (d1, d0)):
            s = Series(data, index=index)
            for i, d in zip(index, data):
                assert s.get(i) == d
                assert s.get(i, d) == d
                assert s.get(i, "z") == d
                for other in others:
                    assert s.get(other, "z") == "z"
                    assert s.get(other, other) == other
Example #28
    def center(self, mass=False):
        """
        Computes 3D coordinates of the geometrical center or the center of mass
        of atoms in the selection.

        Given the atoms of a ring it will calculate the ring center.

        :param mass: calculate center of mass
        :type mass:  :py:bool

        :return:     coordinate representing center
        :rtype:      :numpy:ndarray
        """

        coords = self.coord

        if mass:
            elements = reference_data['element_data']
            atom_mass = Series(elements.atomicMass.values,
                               index=elements.symbol).to_dict()
            # Column vector so the per-atom masses broadcast against (N, 3) coordinates
            scale = numpy.array(
                [atom_mass.get(element, 12.0) for element in self['element']]
            ).reshape(-1, 1)
        else:
            scale = numpy.ones((len(coords), 1))

        # Weighted average sum(m_i * x_i) / sum(m_i); with unit weights this is
        # the plain geometrical center.
        scaled = coords * scale
        return numpy.sum(scaled, axis=0) / scale.sum()
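The weighted-center arithmetic in isolation (a made-up two-point system with masses 1 and 3):

import numpy

coords = numpy.array([[0.0, 0.0, 0.0],
                      [4.0, 0.0, 0.0]])
masses = numpy.array([[1.0], [3.0]])
print(numpy.sum(coords * masses, axis=0) / masses.sum())  # [3. 0. 0.]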
Example #29
def radius_gyration(topology, mass=True):
    """
    Compute the radius of gyration of the atom selection

    :param topology: atom selection to compute radius of gyration for
    :type topology:  :interact:TopologyDataFrame
    :param mass:     use atomic masses otherwise equal masses of 1
    :type mass:      :py:bool

    :return:         radius of gyration
    :rtype:          :py:float
    """

    coords = topology.coord

    if mass:
        elements = reference_data['element_data']
        atom_mass = Series(elements.atomicMass.values,
                           index=elements.symbol).to_dict()
        scale = numpy.array(
            [atom_mass.get(element, 12.0) for element in topology['element']])
    else:
        scale = numpy.ones((len(coords), 1))

    weights = scale / scale.sum()

    mu = coords.mean(1)
    centered = (coords.transpose((1, 0, 2)) - mu).transpose((1, 0, 2))
    squared_dists = (centered**2).sum(2)

    return (squared_dists * weights).sum(1)**0.5
Example #30
    def parseSearch(self, modVal, expr=None, force=False, debug=False, quiet=False):
        ts = timestat("Parsing Discogs Search ModVal={0} Files(expr=\'{1}\', force={2}, debug={3}, quiet={4})".format(modVal, expr, force, debug, quiet))
                        
        io = fileIO()

            
        ########################################################################################
        # Previous DB Data
        ########################################################################################
        if not fileUtil(self.disc.getDBModValFilename(modVal)).exists:
            tsDB = timestat("Creating New DB For ModVal={0}".format(modVal))
            dbdata = Series({})
            tsDB.stop()
        else:
            tsDB = timestat("Loading ModVal={0} DB Data".format(modVal))
            dbdata = self.disc.getDBModValData(modVal)
            tsDB.stop()
            
        
        ########################################################################################
        # Previous Media Data
        ########################################################################################
        previousMetadata = self.disc.getMetadataAlbumData(modVal)
        
        
        ########################################################################################
        # Artist Search Data (No Media)
        ########################################################################################
        tsDB = timestat("Loading Artist Search Data For ModVal={0}".format(modVal))
        artistSearchFilenames = self.getArtistRawFiles(datatype="search", expr=expr, force=True)
        artistSearchFilename = [x for x in artistSearchFilenames if fileUtil(x).basename == "artistData-{0}".format(modVal)]
        if len(artistSearchFilename) == 1:
            artistSearchData = io.get(artistSearchFilename[0])
        else:
            raise ValueError("Could not find Discogs API Artist Search Data")
        tsDB.stop()
        
        
        N = artistSearchData.shape[0]
        modValue = 5000 if N >= 50000 else 1000
        nSave = 0
        tsParse = timestat("Parsing {0} Searched For Discogs API Artists".format(N))
        Nnew = 0
        for i,(artistID,artistData) in enumerate(artistSearchData.iterrows()):
            if (i+1) % modValue == 0 or (i+1) == N:
                tsParse.update(n=i+1, N=N)
            if dbdata.get(artistID) is not None:
                continue
            artistAPIData = {"Artist": artistData, "Albums": previousMetadata.get(artistID, {})}
            dbdata = dbdata.append(Series({artistID: self.artist.getData(artistAPIData)}))
            Nnew += 1
            
        if Nnew > 0:
            print("Saving [{0}/{1}] {2} Entries To {3}".format(len(dbdata), len(dbdata), "ID Data", self.disc.getDBModValFilename(modVal)))
            self.disc.saveDBModValData(modVal=modVal, idata=dbdata)
        else:
            print("Not saving any of the new data")
                
        ts.stop()            
Example #31
def valid_row(row: Series, required_columns: List[str]) -> bool:
    def check_column_value(column_value: Any) -> bool:
        # `column_value == column_value` is False for NaN, so NaN-like values fail too
        return column_value is not None and isinstance(column_value, str) \
               and column_value != '' and column_value == column_value

    return all(check_column_value(row.get(column_name, None))
               for column_name in required_columns)
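Example behaviour with a made-up row; missing, None and empty values all fail:

import pandas as pd

row = pd.Series({'name': 'Ada', 'email': ''})
print(valid_row(row, ['name']))           # True
print(valid_row(row, ['name', 'email']))  # False: empty string fails the check
print(valid_row(row, ['name', 'phone']))  # False: missing column yields None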
Example #32
    def validateSeries(self, x : pd.Series) -> str:
        """
        validate the results of a problem

        Parameters:
        ----------
        x : Series
            Data series that represents problem information parsed by a solver

        """
        # print("{x.ProblemName} {x.PrimalBound} {x.DualBound} {x.SolverStatus}".format(x=x))
        problemname = x.get(Key.ProblemName)

        sstatus = x.get(Key.SolverStatus)

        if not problemname:
            return ProblemStatusCodes.Unknown

        if pd.isnull(sstatus):
            return ProblemStatusCodes.FailAbort

        #
        # check feasibility
        #
        pb = x.get(Key.PrimalBound)
        if self.isSolInfeasible(x) or not (pd.isnull(pb) or isInf(pb) or self.isLE(x.get(Key.ObjectiveLimit, -1e20), pb) or self.isSolFeasible(x)):
            return ProblemStatusCodes.FailSolInfeasible

        #
        # check reference consistency
        #
        psc = self.isReferenceConsistent(x)

        if psc != ProblemStatusCodes.Ok:
            return psc

        #
        # report inconsistency among solvers.
        #
        if self.isInconsistent(problemname):
            return ProblemStatusCodes.FailInconsistent

        return Key.solverToProblemStatusCode(sstatus)
Example #33
def get_recommend(username, data=data, pearson_result=pearson_result):
    pearson_user = pearson_result[username].drop(username).index
    current_user_index = data[username].dropna().index
    result = Series(dtype=float)
    for user in pearson_user:
        no_index = data[user].dropna().index.difference(
            current_user_index)
        for i in no_index:
            rating = data[user][i] * pearson_result[username][user]
            # Assumption: keep the highest correlation-weighted rating per item;
            # the original if/else assigned `rating` in both branches, which
            # made the comparison a no-op.
            if result.get(i) is None or rating > result[i]:
                result[i] = rating
    return result.sort_values(ascending=False)
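A tiny end-to-end run with a two-user ratings table (names and ratings invented; assumes `data` and `pearson_result` exist when the function is defined):

import pandas as pd
from pandas import Series

data = pd.DataFrame({"alice": [5.0, 3.0, None],
                     "bob":   [4.0, 3.0, 4.0]},
                    index=["m1", "m2", "m3"])
pearson_result = data.corr()
print(get_recommend("alice", data=data, pearson_result=pearson_result))
# m3    4.0  (bob's rating for m3, weighted by the alice-bob correlation of 1.0)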
Example #34
# (This snippet assumes a Series `a` of ten numbers indexed by letters, e.g.
#  letters = string.ascii_lowercase; a = Series(np.arange(10), index=list(letters[:10])).
#  Several of these accessors, such as get_value/iget, exist only in old pandas versions.)
a['a']       # element
#%%  filter a Series with a positional slice
a[0:5:2]     # Series
#%%  slice by index label (label-based slices include both endpoints)
a['b':'i':2]  # Series
#%%  filter with a boolean Series
a[(a>5)&(a<9)]   # Series
#%%  use the reindex function
a.reindex(list(letters[5:15]))    # Series
#%%  access with get_value / set_value
a.set_value('a',2)   # element
a.get_value('a')     # element
#%%  access with iget_value
a.iget_value(0)      # element
#%%  access with get
a.get('c')           # element
#%%  access with iget
a.iget(0)            # element
#%%  use truncate
a.truncate(5,9)      # Series
#%%  access with item
a.item(9)            # element
a.itemset(9,100)     # element

#%%
a.idxmax()
#%%
a.idxmin()
Example #35
def statistics(request):
    """
    This function is called when the Statistics button is pressed by the user. It's purpose is to
    take the selected platforms as well as some statistical parameters and perform two
    statistical functions: a T-Test and an FDR analysis

    :param request:
    :return: a rendered HTML page.
    """
    cutoff_type = request.GET.get('cutoff_type')
    cutoff_value = float(request.GET.get('cutoff_value'))
    display_values = request.session.get('display_values', {})
    spps = request.GET.get('spps')
    spps = spps.split(',')
    combined_series = []
    display_profile = None
    for spp in spps:
        _, study, display_profile, platform = spp.split('|')
        profile = display_profile.replace('_', '-')
        sample_ids = geo_data.get_sample_ids(study, profile, platform)
        control_sample_ids = []
        diseased_sample_ids = []
        for sample_id in sample_ids:
            sample_attributes = geo_data.get_sample_attributes(study, profile, platform, sample_id)
            if sample_attributes['control']:
                control_sample_ids.append(sample_id)
            else:
                diseased_sample_ids.append(sample_id)

        genes = geo_data.get_all_gene_symbols(study, profile, platform)
        no_of_genes = len(genes)
        control_exprs = zeros((no_of_genes, len(control_sample_ids)))
        diseased_exprs = zeros((no_of_genes, len(diseased_sample_ids)))

        for (g_index, gene) in enumerate(genes):
            gene_exprs = zeros(len(control_sample_ids))
            for (s_index, sample_id) in enumerate(control_sample_ids):
                expr_value = geo_data.get_gene_expression_value(study, profile, platform, sample_id, gene)
                if expr_value == 'None':
                    continue
                gene_exprs[s_index] = expr_value
            control_exprs[g_index] = gene_exprs

            gene_exprs = zeros(len(diseased_sample_ids))
            for (s_index, sample_id) in enumerate(diseased_sample_ids):
                expr_value = geo_data.get_gene_expression_value(study, profile, platform, sample_id, gene)
                if expr_value == 'None':
                    continue
                gene_exprs[s_index] = expr_value
            diseased_exprs[g_index] = gene_exprs

        control_df = DataFrame(control_exprs, index=genes, columns=control_sample_ids)
        diseased_df = DataFrame(diseased_exprs, index=genes, columns=diseased_sample_ids)

        # Perform the t-test and create a pandas Series
        t_statistics, p_values = ttest_ind(control_df.T, diseased_df.T)
        p_values_series = Series(p_values, index=genes)

        # Perform the FDR analysis, create a pandas Series, and sort the p-values
        reject_fdr, pval_fdr = fdr_correction(p_values_series, method='indep')
        fdr_values_series = Series(pval_fdr, index=genes)
        p_values_series = p_values_series.sort_values(ascending=True)

        combined_series = []
        for i in range(len(p_values_series)):
            symbol = p_values_series.index[i]
            p_value = p_values_series.iloc[i]  # positional access; the index holds gene symbols
            if cutoff_type == 'p-value' and p_value > cutoff_value:
                break
            fdr_value = fdr_values_series.get(symbol)
            if cutoff_type == 'fdr-value' and fdr_value > cutoff_value:
                break
            combined_series.append([symbol, p_value, fdr_value])

        display_values[display_profile] = combined_series

    request.session['display_values'] = display_values
    response = render_to_string('statistics.html',
                                {display_profile: combined_series})

    return HttpResponse(response)
Example #36
# Get frequency distribution on the given condition.
sent_fd = nltk.FreqDist(
            word.lower() for word in words
            if len(word) == length and
               check_condition(word, userinput)
        )               

# Display the top 3 frequent words if applicable.                
series = Series(sent_fd)
series.sort_values(ascending=False, inplace=True)
sumValues = series.sum()
top_words = series.keys()
count = len(top_words)
if count > 0:
    i = 0
    while i < count and i < 3:
        # Positional access: an integer key on a string index is ambiguous for
        # Series.get, so take the i-th value explicitly with iloc.
        print(str(i + 1) + ': ' + top_words[i] + '  (' +
              str(round(100 * series.iloc[i] / sumValues, 1)) + ' %)')
        i += 1
else:
    print("It doesn't seem like there is any word like that.")

Example #37
from pandas import Series
from utils import util

dateList = [1, 3, 5, 6, 8]
db = {"No.1": "Wo", "No.2": "Shi", "No.3": "Ni", "No.4": "Da", "No.5": "Ye"}
# Creating a Series by passing a list of values, letting pandas create a default integer index
s = Series(dateList, index=["A", "B", "C", "D", "E"])  # the index must match the list length, otherwise the default [0, ..., len(data) - 1] is used
util.report_tag("Series from a list")
print("Series data structure is \n", s)
print("index is ", s.index)
print("values is ", s.values)
print("the first element is ", s[0])
print("elements 0~3 are \n", s[:3])
print("elements > 3 are \n", s[s > 3])
# print("s[6] would raise a KeyError", s[6])
print("s.get(6) returns None instead:", s.get(6))
print("first 2 rows\n", s.head(2))
print("last 2 rows\n", s.tail(2))

util.report_tag("Series from a dict")
s = Series(db)
print("Series data structure is \n", s)
# A Series behaves much like a dict: read values with s[index] and test membership with `in`.
# s[index] raises a KeyError for a missing key, whereas s.get(index) returns None,
# and s.get(index, "default") substitutes a default when the key is absent.
print("No.1 is ", s['No.1'])
print("No.1 exists:", 'No.1' in s)
print("No.1 & No.2 are \n", s[['No.1', 'No.2']])

util.report_tag("Series filtering")
s = Series(dateList)
print("data > 5 is \n", s[s > 5])
Example #38
def test_get(arr):
    # GH 21260
    s = Series(arr, index=[2 * i for i in range(len(arr))])
    assert s.get(4) == s.iloc[2]

    result = s.get([4, 6])
    expected = s.iloc[[2, 3]]
    tm.assert_series_equal(result, expected)

    result = s.get(slice(2))
    expected = s.iloc[[0, 1]]
    tm.assert_series_equal(result, expected)

    assert s.get(-1) is None
    assert s.get(s.index.max() + 1) is None

    s = Series(arr[:6], index=list('abcdef'))
    assert s.get('c') == s.iloc[2]

    result = s.get(slice('b', 'd'))
    expected = s.iloc[[1, 2, 3]]
    tm.assert_series_equal(result, expected)

    result = s.get('Z')
    assert result is None

    assert s.get(4) == s.iloc[4]
    assert s.get(-1) == s.iloc[-1]
    assert s.get(len(s)) is None

    # GH 21257
    s = pd.Series(arr)
    s2 = s[::2]
    assert s2.get(1) is None