Example #1
class LocationTreeCache(object):
    def __init__(self):

        self.location_tree_columns = ["location_id", "parent_location_id"]
        self.location_tree_df = DataFrame(columns=self.location_tree_columns)

    def main(self):
        """
        Attribute every ancestor to every descendant location. See the test
        case for an abstracted example.
        """

        ## iterate from the bottom of the tree to the top ##
        location_type_loop_order = LocationType.objects.all().values_list("id", flat=True).order_by("-admin_level")

        for lt_id in location_type_loop_order:
            self.process_location_tree_lvl(lt_id)

        self.upsert_location_tree()

    def process_location_tree_lvl(self, location_type_id):

        lt_batch = []

        location_df = DataFrame(
            list(Location.objects.filter(location_type_id=location_type_id).values_list("id", "parent_location_id")),
            columns=self.location_tree_columns,
        )

        merged_df = location_df.merge(self.location_tree_df, left_on="location_id", right_on="parent_location_id")

        cleaned_merge_df = merged_df[["location_id_y", "parent_location_id_x"]]
        cleaned_merge_df.columns = self.location_tree_columns

        self.location_tree_df = concat([self.location_tree_df, location_df, cleaned_merge_df])

        self.location_tree_df = self.location_tree_df.drop_duplicates()  # re-assign: drop_duplicates is not in-place

    def upsert_location_tree(self):

        lt_batch = []

        ## only the ultimate parent should have itself as a parent ##
        ## drop all NA values, then create the ultimate parents ##
        self.location_tree_df.dropna(inplace=True)
        for loc in Location.objects.filter(parent_location_id__isnull=True):
            lt_batch.append(LocationTree(**{"location_id": loc.id, "parent_location_id": loc.id, "lvl": 0}))

        ## iterate through the location tree df created above ##
        for ix, loc in self.location_tree_df.iterrows():
            lt_batch.append(
                LocationTree(**{"location_id": loc.location_id, "parent_location_id": loc.parent_location_id, "lvl": 0})
            )

        LocationTree.objects.all().delete()
        LocationTree.objects.bulk_create(lt_batch)
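A minimal, self-contained sketch (toy IDs, plain pandas, no Django models) of the merge-and-concat expansion that process_location_tree_lvl performs, assuming location 3 sits under 2, which sits under 1:

from pandas import DataFrame, concat

cols = ["location_id", "parent_location_id"]
direct = DataFrame([[2, 1], [3, 2]], columns=cols)   # direct parent/child links

tree = direct.copy()
# one expansion pass: rows whose parent is itself a known child gain a grandparent link
merged = direct.merge(tree, left_on="location_id", right_on="parent_location_id")
expanded = merged[["location_id_y", "parent_location_id_x"]]
expanded.columns = cols

tree = concat([tree, expanded]).drop_duplicates()
print(tree)   # now includes (3, 1): location 3 attributed to its grandparent 1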
def recursive_add_consumers(consumer_id, seen=set([])):
    if consumer_id is None:
        return

    seen.add(consumer_id)
    consumer_key = sample[sample.Consumer == consumer_id]
    IP = df.drop_duplicates(df(consumer_key.IP))

    n = np.array(np.arange(len(IP)))

    IP_Map = []  # DataFrame slices are unhashable, so collect them in a list
    for i in n:
        value = sample[sample.IP.isin([IP.iloc[i, 0]])]
        IP_Map.append(value)

    # print IP_Map

    print(consumer_id)
    print(seen)
    consumer_list = []

    # list of unique consumers that are linked to this one
    for y in IP_Map:
        consumer_list.extend(y.Consumer.tolist())

    # print consumer_list
    # print [x for x in set(consumer_list).difference([consumer_id])]
    # unique_consumer_list = []
    # print [ x for x in set([y.Consumer.iloc[0] for y in IP_Map])]

    # tuples of ips and unique consumers attached to them
    print([(y.IP.iloc[0], set(y.Consumer.tolist())) for y in IP_Map])
Example #3
 def getDateTimeSeries(self, instrument=None):
     if instrument is None:
         __dateTime = DataFrame()
         for element in self.__instrument:
             __dateTime = __dateTime.append(self.__feed[element].getPriceDataSeries().getDateTimes())
         __dateTime = __dateTime.drop_duplicates([0])
         return __dateTime.values  # a 2-D array is returned here
     return self.__feed[instrument].getPriceDataSeries().getDateTimes()
def IP_Weight_Calc(consumer_id):

    if consumer_id is None:
        return
    consumer_key = sample[sample.Consumer == consumer_id]

    IP = df.drop_duplicates(df(consumer_key.IP))

    n = np.array(np.arange(len(IP)))
    IP_Weight_List = []

    for i in n:
        value = sample[sample.IP.isin([IP.iloc[i, 0]])]
        value2 = len(df.drop_duplicates(df(value.Consumer)))
        value3 = 1.0 / (value2 ** 2)  # fractional weight; avoid integer division
        IP_Weight_List.append(value3)

    return sum(IP_Weight_List)
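The weight returned above is one over the square of the number of distinct consumers sharing each IP, summed over the consumer's IPs; a quick toy check of that rule with made-up counts, independent of the sample/df globals:

consumers_per_ip = [1, 3]                        # this consumer's two IPs are shared by 1 and 3 consumers
weight = sum(1.0 / n ** 2 for n in consumers_per_ip)
print(weight)                                    # 1.0 + 1/9 = 1.111...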
Example #5
    def _compute_consistency(self):
        results = self.cursor.execute("SELECT did, type, entity FROM entities")
        tmp = results.fetchall()
        df = DataFrame(tmp, columns=["did", "e_type", "entity"])
        df = df.drop_duplicates()
        tmp = df.groupby(["e_type", "entity"]).size().reset_index()
        tmp.rename(columns={0: "consistency"}, inplace=True)

        return tmp
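The pattern above (deduplicate, then count how many documents mention each (type, entity) pair) can be checked on toy rows; a standalone sketch with made-up data rather than the class's SQLite table:

from pandas import DataFrame

rows = [(1, "LOC", "Paris"), (1, "LOC", "Paris"), (2, "LOC", "Paris"), (2, "PER", "Alice")]
toy = DataFrame(rows, columns=["did", "e_type", "entity"]).drop_duplicates()
consistency = toy.groupby(["e_type", "entity"]).size().reset_index()
consistency.rename(columns={0: "consistency"}, inplace=True)
print(consistency)   # ('LOC', 'Paris') appears in 2 documents, ('PER', 'Alice') in 1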
Example #6
 def compute_ambiguity(self):
     results = self.cursor.execute("SELECT type, entity FROM entities")
     tmp = results.fetchall()
     freq_df = DataFrame(tmp, columns=["e_type", "entity"])
     freq_df["ambiguity"] = 10
     freq_df = freq_df.drop_duplicates()
     result_computed_location = self._compute_location_ambiguity(freq_df)
     result_computed_name = self._compute_name_ambiguity(result_computed_location)
     return result_computed_name
Example #7
def _get_data(idx_sym):
    """
    Returns DataFrame containing list of component information for
    index represented in idx_sym from yahoo. Includes component symbol
    (ticker), exchange, and name.

    Parameters
    ----------
    idx_sym : str
        Stock index symbol
        Examples:
        '^DJI' (Dow Jones Industrial Average)
        '^NYA' (NYSE Composite)
        '^IXIC' (NASDAQ Composite)

        See: http://finance.yahoo.com/indices for other index symbols

    Returns
    -------
    idx_df : DataFrame
    """
    stats = "snx"
    # URL of form:
    # http://download.finance.yahoo.com/d/quotes.csv?s=@%5EIXIC&f=snxl1d1t1c1ohgv
    url = _URL + "s={0}&f={1}&e=.csv&h={2}"

    idx_mod = idx_sym.replace("^", "@%5E")
    url_str = url.format(idx_mod, stats, 1)

    idx_df = DataFrame()
    mask = [True]
    comp_idx = 1

    # LOOP across component index structure,
    # break when no new components are found
    while True in mask:
        url_str = url.format(idx_mod, stats, comp_idx)
        with urlopen(url_str) as resp:
            raw = resp.read()
        lines = raw.decode("utf-8").strip().strip('"').split('"\r\n"')
        lines = [line.strip().split('","') for line in lines]

        temp_df = DataFrame(lines, columns=["ticker", "name", "exchange"])
        temp_df = temp_df.drop_duplicates()
        temp_df = temp_df.set_index("ticker")
        mask = ~temp_df.index.isin(idx_df.index)

        comp_idx = comp_idx + 50
        idx_df = idx_df.append(temp_df[mask])

    return idx_df
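A hypothetical call, assuming the (long-retired) Yahoo Finance CSV endpoint behind _URL still answered; it only illustrates the intended usage:

dji_components = _get_data("^DJI")       # DataFrame indexed by ticker, deduplicated
print(dji_components.head())             # name and exchange for each component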
def PrepeareAndSave(uid, dataToSearch, data, fileName="data.csv"):
    people = []
    address = []
    poi = []
    activity = []
    timeB = []
    timeE = []
    entities = []
    taxonomy = []
    concepts = []
    txt = []

    for i in range(0, len(dataToSearch["people"])):
        for j in range(0, len(data[i][2])):
            people.append(dataToSearch["people"][i])
            address.append(dataToSearch["address"][i])
            poi.append(dataToSearch["poi"][i])
            activity.append(dataToSearch["activity"][i])
            timeB.append(dataToSearch["timeB"][i])
            timeE.append(dataToSearch["timeE"][i])

            entities.append(data[i][2][j])
            taxonomy.append(data[i][3][j])
            concepts.append(data[i][4][j])
            txt.append(base64.b64encode(data[i][1][j]))

    df = DataFrame()
    df["people"] = people
    df["address"] = address
    df["poi"] = poi
    df["activity"] = activity
    df["entities"] = entities
    df["taxonomy"] = taxonomy
    df["concepts"] = concepts
    df["timeB"] = timeB
    df["timeE"] = timeE
    df["txt"] = txt
    df = df.drop_duplicates(keep="last").reset_index().drop(["index"], axis=1)
    # print df
    try:
        df_old = read_csv(ROOT + str(uid) + "/" + fileName, ";")
        df_new = [df_old, df]
        df = pd.concat(df_new).drop(["Unnamed: 0"], axis=1)
    except:
        print ("New Data")
    df.to_csv(ROOT + str(uid) + "/" + fileName, sep=";")
    return df
Example #9
    def compute_tf_idf(self):
        # Find total number of document
        results = self.cursor.execute("SELECT seq FROM sqlite_sequence WHERE name='{}'".format("documents"))
        tmp = results.fetchone()
        total_doc = tmp[0]

        results = self.cursor.execute("SELECT did, type, entity FROM entities")
        tmp = results.fetchall()
        df = DataFrame(tmp, columns=["did", "e_type", "entity"])

        base_df = df[["e_type", "entity"]]
        base_df = base_df.drop_duplicates()

        doc_t_df = df.drop_duplicates().groupby("entity").size()

        results = self.cursor.execute("SELECT did, total_word FROM documents")
        tmp = results.fetchall()
        df2 = DataFrame(tmp, columns=["did", "total_word"])

        tmp = df[["did", "entity"]].groupby(["did", "entity"]).size().reset_index()
        tmp.rename(columns={0: "term_freq"}, inplace=True)

        tf_idf_list = []

        for row in tmp.iterrows():
            values = row[1]
            did = values[0]
            entity = values[1]
            term_freq = values[2]
            total_word = df2[df2["did"] == did]["total_word"].values[0]
            tf = float(term_freq) / total_word
            doc_t = doc_t_df[entity]
            idf = np.log(total_doc / float(doc_t))
            tf_idf = tf * idf
            tf_idf_list.append([entity, tf_idf])

        tf_idf_df = DataFrame(tf_idf_list, columns=["entity", "tf_idf"])
        tf_idf_df = tf_idf_df.groupby("entity").agg("sum")

        base_df.loc[:, "tf_idf"] = base_df["entity"].apply(lambda x: tf_idf_df["tf_idf"][x])

        return base_df
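For reference, the score accumulated in the loop above is the standard tf-idf; a worked check with made-up counts, independent of the database:

import numpy as np

term_freq, total_word = 3, 120        # the entity occurs 3 times in a 120-word document
total_doc, doc_t = 50, 5              # 50 documents overall, 5 of them contain the entity
tf_idf = (term_freq / float(total_word)) * np.log(total_doc / float(doc_t))
print(round(tf_idf, 4))               # 0.0576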
Example #10
def hierarchical_clusters(log, show_plot=None):
    """Translates traces to Parikh vectors and computes in the vector space
       a hierarchical clustering."""

    def get_parikh(case, alphabet):
        v = zeros(len(alphabet), dtype=int)
        for act in case:
            v[alphabet[act]] = v[alphabet[act]] + 1
        # canonical representation
        m = min(v)
        return v - m

    actsind = {}
    i = 0
    for act in log.get_alphabet():
        actsind[act] = i
        i = i + 1

    uniq_cases = log.get_uniq_cases()
    N = len(uniq_cases)
    M = len(actsind)
    data = zeros((N, M), dtype=int)
    i = 0
    parikhdict = {}
    for case in uniq_cases.keys():
        data[i] = get_parikh(case, actsind)
        str_i = ",".join(map(str, data[i]))
        if str_i not in parikhdict:
            parikhdict[str_i] = [i]
        else:
            parikhdict[str_i].append(i)
        i = i + 1
    df = DataFrame(data)
    data_uniq = df.drop_duplicates()
    Y = pdist(data_uniq, metric="euclidean")
    Z = linkage(Y, method="average")
    dendrogram(Z)
    show()
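A toy illustration of the Parikh encoding used by get_parikh above (count each activity in a trace, then subtract the minimum so the vector is canonical); the alphabet and trace are made up:

from numpy import zeros

alphabet = {"a": 0, "b": 1, "c": 2}
case = ["a", "b", "a", "c", "a"]
v = zeros(len(alphabet), dtype=int)
for act in case:
    v[alphabet[act]] += 1
print(v - v.min())   # [2 0 1]: counts (3, 1, 1) shifted down by the minimum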
from io import StringIO

###############################################################

data = DataFrame({"k1": ["one"] * 3 + ["two"] * 4, "k2": [1, 1, 2, 3, 3, 4, 4]})

print(data)
print("\n")

print(data.duplicated())
print("\n")

print(data.drop_duplicates())
print("\n")

data["v1"] = range(7)
print(data.drop_duplicates(["k1"]))
print("\n")

print(data.drop_duplicates(["k1", "k2"], take_last=True))
print("\n")

data = DataFrame(
    {
        "food": [
            "bacon",
            "pulled pork",
            "bacon",
            "Pastrami",
            "corned beef",
            "Bacon",
            "pastrami",
Example #12
print(nutrients[:7])

info_keys = ["description", "group", "id", "manufacturer"]
info = DataFrame(db, columns=info_keys)
print(pd.value_counts(info.group)[:10])

nutrients = []
for rec in db:
    fnuts = DataFrame(rec["nutrients"])
    fnuts["id"] = rec["id"]
    nutrients.append(fnuts)
nutrients = pd.concat(nutrients, ignore_index=True)
print(nutrients[:10])
print(nutrients.duplicated().sum())

nutrients = nutrients.drop_duplicates()

col_mapping = {"description": "food", "group": "fgroup"}
info = info.rename(columns=col_mapping, copy=False)
print(info[:10])

col_mapping = {"description": "nutrient", "group": "nutgroup"}
nutrients = nutrients.rename(columns=col_mapping, copy=False)
print(nutrients[:10])

ndata = pd.merge(nutrients, info, on="id", how="outer")
print(ndata[:10])
print(ndata.iloc[30000])

result = ndata.groupby(["nutrient", "fgroup"])["value"].quantile(0.5)
result["Zinc, Zn"].sort_values().plot(kind="barh")
Example #13
df1 = DataFrame({"key": ["b", "b", "a", "c", "a", "a", "b"], "data1": range(7)})
df2 = DataFrame({"key": ["a", "b", "d"], "data2": range(3)})
# dfMerged = pd.merge(df1, df2, on='key')
# print dfMerged
# dfMergedOuter = pd.merge(df1, df2, how='outer')
# print dfMergedOuter

df3 = DataFrame({"lkey": ["b", "b", "a", "c", "a", "a", "b"], "data1": range(7)})
df4 = DataFrame({"rkey": ["a", "b", "d"], "data2": range(3)})
# dfMerged = pd.merge(df3, df4, left_on='lkey', right_on='rkey')
# print dfMerged

left = DataFrame({"key1": ["foo", "foo", "bar"], "key2": ["one", "foo", "one"], "lval": [1, 2, 3]})

right = DataFrame({"key1": ["foo", "foo", "bar", "bar"], "key2": ["one", "foo", "one", "one"], "rval": [4, 5, 6, 7]})

dfMergedOuter = pd.merge(left, right, how="outer")
# print dfMergedOuter

arr = np.arange(12).reshape((6, 2))
# print arr
# print np.arange(12)
arrConcat = np.concatenate([arr, arr], axis=1)
# print arrConcat

data = DataFrame({"k1": ["one"] * 3 + ["two"] * 4, "k2": [1, 1, 2, 3, 3, 4, 4]})
# print data
dataDuplicate = data.duplicated()
print(dataDuplicate)
dropDuplicate = data.drop_duplicates()
print(dropDuplicate)
Example #14
def detect_vec(
    df,
    max_anoms=0.10,
    direction="pos",
    alpha=0.05,
    period=None,
    only_last=False,
    threshold=None,
    e_value=False,
    longterm_period=None,
    plot=False,
    y_log=False,
    xlabel="",
    ylabel="count",
    title=None,
    verbose=False,
):
    """
    Anomaly Detection Using Seasonal Hybrid ESD Test

    A technique for detecting anomalies in seasonal univariate time series where the input is a
    series of observations.

    Args:
    df: Time series as a single-column data frame, list, or vector, where the column consists of
    the observations.

    max_anoms: Maximum number of anomalies that S-H-ESD will detect as a percentage of the
    data.

    direction: Directionality of the anomalies to be detected. Options are: ('pos' | 'neg' | 'both').

    alpha: The level of statistical significance with which to accept or reject anomalies.
    period: Defines the number of observations in a single period, and used during seasonal
    decomposition.

    only_last: Find and report anomalies only within the last period in the time series.
    threshold: Only report positive going anoms above the threshold specified. Options are: ('None' | 'med_max' | 'p95' | 'p99').

    e_value: Add an additional column to the anoms output containing the expected value.

    longterm_period: Defines the number of observations for which the trend can be considered
    flat. The value should be an integer multiple of the number of observations in a single period.
    This increases anom detection efficacy for time series that are greater than a month.

    plot: (Currently unsupported) A flag indicating if a plot with both the time series and the estimated anoms,
    indicated by circles, should also be returned.

    y_log: Apply log scaling to the y-axis. This helps with viewing plots that have extremely
    large positive anomalies relative to the rest of the data.

    xlabel: X-axis label to be added to the output plot.
    ylabel: Y-axis label to be added to the output plot.

    Details

    'longterm_period' This option should be set when the input time series is longer than a month.
    The option enables the approach described in Vallis, Hochenbaum, and Kejariwal (2014).

    'threshold' Filter all negative anomalies and those anomalies whose magnitude is smaller
    than one of the specified thresholds which include: the median
    of the daily max values (med_max), the 95th percentile of the daily max values (p95), and the
    99th percentile of the daily max values (p99).

    'title' Title for the output plot.
    'verbose' Enable debug messages

    The returned value is a dictionary with the following components:
      anoms: Data frame containing index, values, and optionally expected values.
      plot: A graphical object if plotting was requested by the user. The plot contains
    the estimated anomalies annotated on the input time series.
    """

    if isinstance(df, DataFrame) and len(df.columns) == 1 and df.iloc[:, 0].map(np.isreal).all():
        d = {"timestamp": range(len(df.iloc[:, 0])), "value": df.iloc[:, 0]}
        df = DataFrame(d, index=d["timestamp"])
    elif isinstance(df, Series):
        d = {"timestamp": range(len(df)), "value": df}
        df = DataFrame(d, index=d["timestamp"])
    else:
        raise ValueError(("data must be a single data frame, " "list, or vector that holds numeric values."))

    if max_anoms > 0.49:
        length = len(df.value)
        raise ValueError(
            ("max_anoms must be less than 50% of " "the data points (max_anoms =%f data_points =%s).")
            % (round(max_anoms * length, 0), length)
        )

    if not direction in ["pos", "neg", "both"]:
        raise ValueError("direction options are: pos | neg | both.")

    if not (0.01 <= alpha <= 0.1):
        if verbose:
            import warnings

            warnings.warn("alpha is the statistical significance, and is usually between 0.01 and 0.1")

    if not period:
        raise ValueError(("Period must be set to the number " "of data points in a single period"))

    if not isinstance(only_last, bool):
        raise ValueError("only_last must be a boolean")

    if not threshold in [None, "med_max", "p95", "p99"]:
        raise ValueError("threshold options are: None | med_max | p95 | p99")

    if not isinstance(e_value, bool):
        raise ValueError("e_value must be a boolean")

    if not isinstance(plot, bool):
        raise ValueError("plot must be a boolean")

    if not isinstance(y_log, bool):
        raise ValueError("y_log must be a boolean")

    if not isinstance(xlabel, basestring):
        raise ValueError("xlabel must be a string")

    if not isinstance(ylabel, basestring):
        raise ValueError("ylabel must be a string")

    if title and not isinstance(title, basestring):
        raise ValueError("title must be a string")

    if not title:
        title = ""
    else:
        title = title + " : "

    # -- Main analysis: Perform S-H-ESD

    num_obs = len(df.value)

    clamp = 1 / float(num_obs)
    if max_anoms < clamp:
        max_anoms = clamp

    # -- Setup for longterm time series

    # If longterm is enabled, break the data into subset
    # data frames and store in all_data,

    if longterm_period:
        all_data = []
        for j in range(0, len(df.timestamp), longterm_period):
            start_index = df.timestamp.iget(j)
            end_index = min((start_index + longterm_period), num_obs)
            if (end_index - start_index) == longterm_period:
                sub_df = df[(df.timestamp >= start_index) & (df.timestamp <= end_index)]
            else:
                sub_df = df[(df.timestamp >= (num_obs - longterm_period)) & (df.timestamp <= num_obs)]
            all_data.append(sub_df)
    else:
        all_data = [df]

    # Create empty data frames to store all anoms and
    # seasonal+trend component from decomposition
    all_anoms = DataFrame(columns=["timestamp", "value"])
    seasonal_plus_trend = DataFrame(columns=["timestamp", "value"])

    # Detect anomalies on all data (either entire data in one-pass,
    # or in 2 week blocks if longterm=TRUE)
    for i in range(len(all_data)):
        directions = {"pos": Direction(True, True), "neg": Direction(True, False), "both": Direction(False, True)}
        anomaly_direction = directions[direction]

        s_h_esd_timestamps = detect_anoms(
            all_data[i],
            k=max_anoms,
            alpha=alpha,
            num_obs_per_period=period,
            use_decomp=True,
            one_tail=anomaly_direction.one_tail,
            upper_tail=anomaly_direction.upper_tail,
            verbose=verbose,
        )

        # store decomposed components in local variable and
        # overwrite s_h_esd_timestamps to contain only the anom timestamps
        data_decomp = s_h_esd_timestamps["stl"]
        s_h_esd_timestamps = s_h_esd_timestamps["anoms"]

        # -- Step 3: Use detected anomaly timestamps to
        # extract the actual anomalies (timestamp and value) from the data
        if s_h_esd_timestamps:
            anoms = all_data[i][all_data[i].timestamp.isin(s_h_esd_timestamps)]
        else:
            anoms = DataFrame(columns=["timestamp", "value"])

        # Filter the anomalies using one of the thresholding
        # functions if applicable
        if threshold:
            # Calculate daily max values
            if isinstance(all_data[i].index[0], np.int64):
                group = all_data[i].timestamp.map(lambda t: t / period)
            else:
                group = all_data[i].timestamp.map(Timestamp.date)
            periodic_maxes = df.groupby(group).aggregate(np.max).value

            # Calculate the threshold set by the user
            if threshold == "med_max":
                thresh = periodic_maxes.median()
            elif threshold == "p95":
                thresh = periodic_maxes.quantile(0.95)
            elif threshold == "p99":
                thresh = periodic_maxes.quantile(0.99)

            # Remove any anoms below the threshold
            anoms = anoms[anoms.value >= thresh]

        all_anoms = all_anoms.append(anoms)
        seasonal_plus_trend = seasonal_plus_trend.append(data_decomp)

    # Cleanup potential duplicates
    all_anoms = all_anoms.drop_duplicates(subset=["timestamp"])
    seasonal_plus_trend = seasonal_plus_trend.drop_duplicates(subset=["timestamp"])

    # -- If only_last was set by the user, create subset of
    # the data that represent the most recent period
    if only_last:
        d = {"timestamp": df.timestamp.iloc[-period:], "value": df.value.iloc[-period:]}
        x_subset_single_period = DataFrame(d, index=d["timestamp"])
        past_obs = period * 7
        if num_obs < past_obs:
            past_obs = num_obs - period
        # When plotting anoms for the last period only we only show
        # the previous 7 periods of data
        d = {"timestamp": df.timestamp.iloc[-past_obs:-period], "value": df.value.iloc[-past_obs:-period]}
        x_subset_previous = DataFrame(d, index=d["timestamp"])
        all_anoms = all_anoms[all_anoms.timestamp >= x_subset_single_period.timestamp.iget(0)]
        num_obs = len(x_subset_single_period.value)

    # Calculate number of anomalies as a percentage
    anom_pct = (len(df.value) / float(num_obs)) * 100

    if anom_pct == 0:
        return {"anoms": None, "plot": None}

    # The original R implementation handles plotting here.
    # Plotting is currently not implemented.
    # if plot:
    #     plot_something()

    all_anoms.index = all_anoms.timestamp

    if e_value:
        d = {
            "timestamp": all_anoms.timestamp,
            "anoms": all_anoms.value,
            "expected_value": seasonal_plus_trend[seasonal_plus_trend.timestamp.isin(all_anoms.timestamp)].value,
        }
    else:
        d = {"timestamp": all_anoms.timestamp, "anoms": all_anoms.value}
    anoms = DataFrame(d, index=d["timestamp"].index)

    return {"anoms": anoms, "plot": None}
Example #15
def detect_ts(
    df,
    max_anoms=0.10,
    direction="pos",
    alpha=0.05,
    only_last=None,
    threshold=None,
    e_value=False,
    longterm=False,
    piecewise_median_period_weeks=2,
    plot=False,
    y_log=False,
    xlabel="",
    ylabel="count",
    title=None,
    verbose=False,
):
    """
    Anomaly Detection Using Seasonal Hybrid ESD Test
    A technique for detecting anomalies in seasonal univariate time series where the input is a
    series of <timestamp, value> pairs.

    Args:

    df: Time series as a two column data frame where the first column consists of the
    timestamps and the second column consists of the observations.

    max_anoms: Maximum number of anomalies that S-H-ESD will detect as a percentage of the
    data.

    direction: Directionality of the anomalies to be detected. Options are: ('pos' | 'neg' | 'both').

    alpha: The level of statistical significance with which to accept or reject anomalies.

    only_last: Find and report anomalies only within the last day or hr in the time series. Options: (None | 'day' | 'hr')

    threshold: Only report positive going anoms above the threshold specified. Options are: (None | 'med_max' | 'p95' | 'p99')

    e_value: Add an additional column to the anoms output containing the expected value.

    longterm: Increase anom detection efficacy for time series that are greater than a month.

    See Details below.
    piecewise_median_period_weeks: The piecewise median time window as described in Vallis, Hochenbaum, and Kejariwal (2014). Defaults to 2.

    plot: (Currently unsupported) A flag indicating if a plot with both the time series and the estimated anoms,
    indicated by circles, should also be returned.

    y_log: Apply log scaling to the y-axis. This helps with viewing plots that have extremely
    large positive anomalies relative to the rest of the data.

    xlabel: X-axis label to be added to the output plot.
    ylabel: Y-axis label to be added to the output plot.

    Details


    'longterm' This option should be set when the input time series is longer than a month.
    The option enables the approach described in Vallis, Hochenbaum, and Kejariwal (2014).
    'threshold' Filter all negative anomalies and those anomalies whose magnitude is smaller
    than one of the specified thresholds which include: the median
    of the daily max values (med_max), the 95th percentile of the daily max values (p95), and the
    99th percentile of the daily max values (p99).
    'title' Title for the output plot.
    'verbose' Enable debug messages

    The returned value is a dictionary with the following components:
      anoms: Data frame containing timestamps, values, and optionally expected values.
      plot: A graphical object if plotting was requested by the user. The plot contains
      the estimated anomalies annotated on the input time series
    """

    if not isinstance(df, DataFrame):
        raise ValueError("data must be a single data frame.")
    else:
        if len(df.columns) != 2 or not df.iloc[:, 1].map(np.isreal).all():
            raise ValueError(
                (
                    "data must be a 2 column data.frame, with the "
                    "first column being a set of timestamps, and "
                    "the second column being numeric values."
                )
            )

        if not (df.dtypes[0].type is np.datetime64) and not (df.dtypes[0].type is np.int64):
            df = format_timestamp(df)

    if list(df.columns.values) != ["timestamp", "value"]:
        df.columns = ["timestamp", "value"]

    # Sanity check all input parameters
    if max_anoms > 0.49:
        length = len(df.value)
        raise ValueError(
            ("max_anoms must be less than 50% of " "the data points (max_anoms =%f data_points =%s).")
            % (round(max_anoms * length, 0), length)
        )

    if not direction in ["pos", "neg", "both"]:
        raise ValueError("direction options are: pos | neg | both.")

    if not (0.01 <= alpha <= 0.1):
        if verbose:
            import warnings

            warnings.warn("alpha is the statistical significance, and is usually between 0.01 and 0.1")

    if only_last and not only_last in ["day", "hr"]:
        raise ValueError("only_last must be either 'day' or 'hr'")

    if not threshold in [None, "med_max", "p95", "p99"]:
        raise ValueError("threshold options are: None | med_max | p95 | p99")

    if not isinstance(e_value, bool):
        raise ValueError("e_value must be a boolean")

    if not isinstance(longterm, bool):
        raise ValueError("longterm must be a boolean")

    if piecewise_median_period_weeks < 2:
        raise ValueError("piecewise_median_period_weeks must be at greater than 2 weeks")

    if not isinstance(plot, bool):
        raise ValueError("plot must be a boolean")

    if not isinstance(y_log, bool):
        raise ValueError("y_log must be a boolean")

    if not isinstance(xlabel, basestring):
        raise ValueError("xlabel must be a string")

    if not isinstance(ylabel, basestring):
        raise ValueError("ylabel must be a string")

    if title and not isinstance(title, basestring):
        raise ValueError("title must be a string")

    if not title:
        title = ""
    else:
        title = title + " : "

    gran = get_gran(df)

    if gran == "day":
        num_days_per_line = 7
        if isinstance(only_last, basestring) and only_last == "hr":
            only_last = "day"
    else:
        num_days_per_line = 1

    if gran == "sec":
        df.timestamp = date_format(df.timestamp, "%Y-%m-%d %H:%M:00")
        df = format_timestamp(df.groupby("timestamp").aggregate(np.sum))

    # if the data is daily, then we need to bump
    # the period to weekly to get multiple examples
    gran_period = {"min": 1440, "hr": 24, "day": 7}
    period = gran_period.get(gran)
    if not period:
        raise ValueError("%s granularity detected. This is currently not supported." % gran)
    num_obs = len(df.value)

    clamp = 1 / float(num_obs)
    if max_anoms < clamp:
        max_anoms = clamp

    if longterm:
        if gran == "day":
            num_obs_in_period = period * piecewise_median_period_weeks + 1
            num_days_in_period = 7 * piecewise_median_period_weeks + 1
        else:
            num_obs_in_period = period * 7 * piecewise_median_period_weeks
            num_days_in_period = 7 * piecewise_median_period_weeks

        last_date = df.timestamp.iget(-1)

        all_data = []

        for j in range(0, len(df.timestamp), num_obs_in_period):
            start_date = df.timestamp.iget(j)
            end_date = min(start_date + datetime.timedelta(days=num_obs_in_period), df.timestamp.iget(-1))

            # if there is at least 14 days left, subset it,
            # otherwise subset last_date - 14days
            if (end_date - start_date).days == num_days_in_period:
                sub_df = df[(df.timestamp >= start_date) & (df.timestamp < end_date)]
            else:
                sub_df = df[
                    (df.timestamp > (last_date - datetime.timedelta(days=num_days_in_period)))
                    & (df.timestamp <= last_date)
                ]
            all_data.append(sub_df)
    else:
        all_data = [df]

    all_anoms = DataFrame(columns=["timestamp", "value"])
    seasonal_plus_trend = DataFrame(columns=["timestamp", "value"])

    # Detect anomalies on all data (either entire data in one-pass,
    # or in 2 week blocks if longterm=TRUE)
    for i in range(len(all_data)):
        directions = {"pos": Direction(True, True), "neg": Direction(True, False), "both": Direction(False, True)}
        anomaly_direction = directions[direction]

        # detect_anoms actually performs the anomaly detection and
        # returns the results in a list containing the anomalies
        # as well as the decomposed components of the time series
        # for further analysis.

        s_h_esd_timestamps = detect_anoms(
            all_data[i],
            k=max_anoms,
            alpha=alpha,
            num_obs_per_period=period,
            use_decomp=True,
            one_tail=anomaly_direction.one_tail,
            upper_tail=anomaly_direction.upper_tail,
            verbose=verbose,
        )

        # store decomposed components in local variable and overwrite
        # s_h_esd_timestamps to contain only the anom timestamps
        data_decomp = s_h_esd_timestamps["stl"]
        s_h_esd_timestamps = s_h_esd_timestamps["anoms"]

        # -- Step 3: Use detected anomaly timestamps to extract the actual
        # anomalies (timestamp and value) from the data
        if s_h_esd_timestamps:
            anoms = all_data[i][all_data[i].timestamp.isin(s_h_esd_timestamps)]
        else:
            anoms = DataFrame(columns=["timestamp", "value"])

        # Filter the anomalies using one of the thresholding functions if applicable
        if threshold:
            # Calculate daily max values
            periodic_maxes = df.groupby(df.timestamp.map(Timestamp.date)).aggregate(np.max).value

            # Calculate the threshold set by the user
            if threshold == "med_max":
                thresh = periodic_maxes.median()
            elif threshold == "p95":
                thresh = periodic_maxes.quantile(0.95)
            elif threshold == "p99":
                thresh = periodic_maxes.quantile(0.99)

            # Remove any anoms below the threshold
            anoms = anoms[anoms.value >= thresh]

        all_anoms = all_anoms.append(anoms)
        seasonal_plus_trend = seasonal_plus_trend.append(data_decomp)

    # Cleanup potential duplicates
    try:
        all_anoms = all_anoms.drop_duplicates(subset=["timestamp"])
        seasonal_plus_trend = seasonal_plus_trend.drop_duplicates(subset=["timestamp"])
    except TypeError:
        # older pandas versions used `cols` instead of `subset`
        all_anoms = all_anoms.drop_duplicates(cols=["timestamp"])
        seasonal_plus_trend = seasonal_plus_trend.drop_duplicates(cols=["timestamp"])

    # -- If only_last was set by the user,
    # create subset of the data that represent the most recent day
    if only_last:
        start_date = df.timestamp.iget(-1) - datetime.timedelta(days=7)
        start_anoms = df.timestamp.iget(-1) - datetime.timedelta(days=1)
        if gran == "day":
            breaks = 3 * 12
            num_days_per_line = 7
        else:
            if only_last == "day":
                breaks = 12
            else:
                start_date = df.timestamp.iget(-1) - datetime.timedelta(days=2)
                # truncate to days
                start_date = datetime.date(start_date.year, start_date.month, start_date.day)
                start_anoms = df.timestamp.iget(-1) - datetime.timedelta(hours=1)
                breaks = 3

        # subset the last days worth of data
        x_subset_single_day = df[df.timestamp > start_anoms]
        # When plotting anoms for the last day only
        # we only show the previous weeks data
        x_subset_week = df[(df.timestamp <= start_anoms) & (df.timestamp > start_date)]
        if len(all_anoms) > 0:
            all_anoms = all_anoms[all_anoms.timestamp >= x_subset_single_day.timestamp.iget(0)]
        num_obs = len(x_subset_single_day.value)

    # Calculate number of anomalies as a percentage
    anom_pct = (len(df.value) / float(num_obs)) * 100

    if anom_pct == 0:
        return {"anoms": None, "plot": None}

    # The original R implementation handles plotting here.
    # Plotting is currently not implemented in this version.
    # if plot:
    #     plot_something()

    all_anoms.index = all_anoms.timestamp

    if e_value:
        d = {
            "timestamp": all_anoms.timestamp,
            "anoms": all_anoms.value,
            "expected_value": seasonal_plus_trend[seasonal_plus_trend.timestamp.isin(all_anoms.timestamp)].value,
        }
    else:
        d = {"timestamp": all_anoms.timestamp, "anoms": all_anoms.value}
    anoms = DataFrame(d, index=d["timestamp"].index)

    return {"anoms": anoms, "plot": None}
Example #16
class data(object):
    def __init__(self):
        self.a = api()
        self.s = sql()
        self.jobs = []
        self.trd = DataFrame()
        self.prc = DataFrame()

    def add_trades(self, exchange, symbol, limit="", since="", auto_since="no", ping_limit=1.0):
        job = {"exchange": exchange, "symbol": symbol}
        self.a.add_job(exchange, symbol, limit=limit, since=since, auto_since=auto_since, ping_limit=ping_limit)
        self.jobs.append(job)

    def get_trades(self, exchange="", symbol="", start=""):
        trd = self.s.select("trades", exchange=exchange, symbol=symbol, start=start)
        self.trd = self.trd.append(trd)
        self.trd = self.trd.drop_duplicates(["tid", "exchange"])

    def run_trades(self, exchange, symbol):
        self.trd = self.trd.append(self.a.run(exchange, symbol))
        self.trd = self.trd.drop_duplicates(["tid", "exchange"])

    def run_loop(self, time, to_sql=60, log="no"):
        dump = tm.time() + to_sql
        end = tm.time() + time
        while tm.time() < end:
            for job in self.jobs:
                self.run_trades(job["exchange"], job["symbol"])
            if tm.time() > dump:
                dump = tm.time() + to_sql
                self.to_sql(log)

    def get_price(self, exchange="", symbol="", freq="", start=""):
        prc = self.s.select("price", exchange=exchange, symbol=symbol, freq=freq, start=start)
        self.prc = self.prc.append(prc)
        self.prc = self.prc.drop_duplicates(["timestamp", "exchange", "symbol", "freq"])
        return prc

    def run_price(self, exchange, symbol, freq, label="left", from_sql="no", start=""):
        if from_sql == "yes":
            self.get_trades(exchange, symbol, start=start)
            # get_trades already applied exchange, symbol checks
            trd = self.trd
        else:
            trd = self.trd
            if exchange != "":
                trd = self.trd[self.trd.exchange == exchange]
            if symbol != "":
                trd = trd[trd.symbol == symbol]
        trd = tools.date_index(trd)
        if len(trd.index) > 0:
            prc = conv.olhcv(trd, freq, label=label)
            self.prc = self.prc.append(prc)
            self.prc = self.prc.drop_duplicates(["timestamp", "exchange", "symbol", "freq"])

    def to_sql(self, log="no"):
        if "sent" in self.trd:
            trd = self.trd[self.trd["sent"] <> "yes"]
        else:
            trd = self.trd
        if "sent" in self.prc:
            prc = self.prc[self.prc["sent"] <> "yes"]
        else:
            prc = self.prc
        self.s.insert("trades", trd)
        self.s.insert("price", prc)
        if log == "yes":
            print(trd)
            print(prc)
        self.trd["sent"] = "yes"
        self.prc["sent"] = "yes"
Example #17
for index, row in edges.iterrows():
    if row["source"] in ID and row["target"] in ID:
        source.append(row["source"])
        target.append(row["target"])

print "%d connections obtained from previous record." % len(source)

files = []

for file in os.listdir("data/friends"):
    if file.endswith(".txt"):
        files.append(file)

i = 0

for thisFile in files:
    thisId = int(thisFile.replace(".txt", ""))
    d = DataFrame.from_csv("data/friends/" + thisFile)
    ids = list(d.index)
    for thisFriend in ids:
        if thisFriend in ID:
            i += 1
            source.append(thisId)
            target.append(thisFriend)

print "%d connections obtained from new friends list." % i

net = {"source": source, "target": target}
df = DataFrame(net, columns=["source", "target"])
df.drop_duplicates(inplace=True)
df.to_csv("data/edge.csv", index=False)
for x in [x1 / 10.0 for x1 in range(-3 * gridSize, 3 * gridSize)]:
    for y in [y1 / 10.0 for y1 in range(-3 * gridSize, 3 * gridSize)]:
        center = distcust(p, distance, x, y)
        url = (
            "https://api.foursquare.com/v2/venues/search?ll=%s,%s&intent=browse&radius=%s&categoryId=%s&client_id=%s&client_secret=%s&v=%s"
            % (center["lat"], center["long"], distance, category_id, client_id, client_secret, time.strftime("%Y%m%d"))
        )
        try:
            req = urllib2.Request(url)
            response = urllib2.urlopen(req)
            data = json.loads(response.read())
            response.close()

            data = DataFrame(data["response"]["venues"])[requested_keys]

            df = df.append(data, ignore_index=True)
            print(center)
            time.sleep(1)  # stay within API limits
        except Exception as e:
            print(e)

df = df.drop_duplicates(subset="id", keep="last")

df["categories"] = df["categories"].apply(lambda x: dict(x[0])["name"])
df["lat"] = df["location"].apply(lambda x: dict(x)["lat"])
df["long"] = df["location"].apply(lambda x: dict(x)["lng"])
df["checkins"] = df["stats"].apply(lambda x: dict(x)["checkinsCount"])

ordered_df = df[["name", "id", "categories", "lat", "long", "checkins"]]
ordered_df.to_csv("foursquare_%s_nyc.csv" % category, encoding="utf-8", index=False)
def ReadDataToProcess(uid, fileName="predata.csv"):
    streets = read_csv(ROOT + str(uid) + "/streets.csv", ",")
    graph = read_csv(ROOT + str(uid) + "/graph.csv", ";")
    poi = read_csv(ROOT + str(uid) + "/poi.csv", ",")
    places = read_csv(ROOT + str(uid) + "/places.csv", ",")
    places["start"] = places["start_time"]
    places["end"] = places["end_time"]
    activities = read_csv(ROOT + str(uid) + "/activities.csv", ",")
    moveData = GetMoveData(activities)
    activities = activities.append(moveData).drop_duplicates(keep="last").reset_index().drop(["index"], axis=1)
    # activities.drop_duplicates(keep="last").reset_index().drop(['index'], axis=1)
    places = places.drop_duplicates(keep="last").reset_index().drop(["index"], axis=1)

    union = places.merge(streets, how="left", on=["lat", "lon", "start_time", "end_time"])
    union = union.drop_duplicates(keep="last").reset_index().drop(["index"], axis=1)
    union = union.merge(poi, how="left", on=["lat", "lon"])
    union = union.drop(
        ["nearest_poi_name", "nearest_poi_addres", "poi_lat", "poi_lon", "start_time", "end_time"], axis=1
    )
    union = union.drop_duplicates(keep="last").reset_index().drop(["index"], axis=1)

    address = []
    poi = []
    activity = []
    timeB = []
    timeE = []
    lat = []
    lon = []
    for i in range(0, len(activities["start"])):
        for j in range(0, len(union["start"])):

            types = str(union["nearest_poi_type"][j]).split("-")
            if (union["start"][j] + union["end"][j]) == (activities["start"][i] + activities["end"][i]):
                for k in types:
                    address.append(union["address"][j])
                    poi.append(k)
                    activity.append(activities["activity"][i])
                    lat.append(union["lat"][j])
                    lon.append(union["lon"][j])
                    timeB.append(union["start"][j])
                    timeE.append(union["end"][j])

    result = DataFrame()

    name = ""
    for i in range(0, len(graph["Relation"])):
        if graph["Relation"][i] == "is_located":
            name = graph["from"][i]
            break

    result["lat"] = lat
    result["lon"] = lon
    result["people"] = name
    result["address"] = address
    result["poi"] = poi
    result["activity"] = activity
    result["timeB"] = timeB
    result["timeE"] = timeE
    result = result.drop_duplicates(keep="last").reset_index().drop(["index"], axis=1)
    try:
        time = read_csv(ROOT + str(uid) + "/" + "data.csv", ";")
        result = ReplaceOld(result, time)
    except:
        print ("All Data New")
    result.to_csv(ROOT + str(uid) + "/" + fileName, sep=";")
    return result
Example #20
        "datetimestatus": status,
    }
)

dubdata.airline = dubdata.airline.map(stripletters)
dubdata.airline = dubdata.airline.map(airline_mapping)
dubdata.datetimestatus = dubdata.datetimestatus.map(getstatustime)
import dateutil

dubdata.scheduled = dubdata.scheduled.map(lambda x: dateutil.parser.parse(x, fuzzy=True))
dubdata.scheduled = dubdata.scheduled.map(lambda x: arrow.get(x))


gatwdata = gatwdata.dropna()
dubdata = dubdata.dropna()
dubdata = dubdata.drop_duplicates()
gatwdata = gatwdata.drop_duplicates()

# gatwdata = gatwdata[gatwdata['status'].str.contains("LANDED")]
import pandas as pd

pieces = [gatwdata[gatwdata["status"].str.contains("LANDED")], gatwdata[gatwdata["status"].str.contains("DEPARTED")]]
gatwdata = pd.concat(pieces)

pieces = [dubdata[dubdata["status"].str.contains("Departed")], dubdata[dubdata["status"].str.contains("Arrived")]]
dubdata = pd.concat(pieces)


dubdata["datescheduled"] = dubdata["scheduled"]
dubdata.datescheduled = dubdata.datescheduled.map(lambda x: x.date())

gatwdata["datescheduled"] = gatwdata["scheduled"]
Example #21
def detect_vec(
    df,
    max_anoms=0.10,
    direction="pos",
    alpha=0.05,
    period=None,
    only_last=False,
    threshold=None,
    e_value=False,
    longterm_period=None,
    plot=False,
    y_log=False,
    xlabel="",
    ylabel="count",
    title=None,
    verbose=False,
):

    if isinstance(df, DataFrame) and len(df.columns) == 1 and df.iloc[:, 0].map(np.isreal).all():
        d = {"timestamp": range(len(df.iloc[:, 0])), "count": df.iloc[:, 0]}
        df = DataFrame(d, index=d["timestamp"])
    elif isinstance(df, Series):
        d = {"timestamp": range(len(df)), "count": df}
        df = DataFrame(d, index=d["timestamp"])
    else:
        raise ValueError(("data must be a single data frame, " "list, or vector that holds numeric values."))

    if max_anoms > 0.49:
        length = len(df.iloc[:, 1])
        raise ValueError(
            ("max_anoms must be less than 50% of " "the data points (max_anoms =%f data_points =%s).")
            % (round(max_anoms * length, 0), length)
        )

    if not direction in ["pos", "neg", "both"]:
        raise ValueError("direction options are: pos | neg | both.")

    if not (0.01 <= alpha <= 0.1):
        if verbose:
            import warnings

            warnings.warn("alpha is the statistical significance, and is usually between 0.01 and 0.1")

    if not period:
        raise ValueError(("Period must be set to the number " "of data points in a single period"))

    if not isinstance(only_last, bool):
        raise ValueError("only_last must be a boolean")

    if not threshold in [None, "med_max", "p95", "p99"]:
        raise ValueError("threshold options are: None | med_max | p95 | p99")

    if not isinstance(e_value, bool):
        raise ValueError("e_value must be a boolean")

    if not isinstance(plot, bool):
        raise ValueError("plot must be a boolean")

    if not isinstance(y_log, bool):
        raise ValueError("y_log must be a boolean")

    if not isinstance(xlabel, basestring):
        raise ValueError("xlabel must be a string")

    if not isinstance(ylabel, basestring):
        raise ValueError("ylabel must be a string")

    if title and not isinstance(title, basestring):
        raise ValueError("title must be a string")

    if not title:
        title = ""
    else:
        title = title + " : "

    # -- Main analysis: Perform S-H-ESD

    num_obs = len(df["count"])

    clamp = 1 / float(num_obs)
    if max_anoms < clamp:
        max_anoms = clamp

    # -- Setup for longterm time series

    # If longterm is enabled, break the data into subset
    # data frames and store in all_data,

    if longterm_period:
        all_data = [None] * int(ceil(len(df["count"]) / float(longterm_period)))  # pre-size the list of sub-frames
        for j in range(0, len(df.timestamp), longterm_period):
            start_index = df.timestamp.iget(j)
            end_index = min((start_index + longterm_period), num_obs)
            if (end_index - start_index) == longterm_period:
                all_data[int(ceil(j / float(longterm_period)))] = df[
                    (df.timestamp >= start_index) & (df.timestamp <= end_index)
                ]
            else:
                all_data[int(ceil(j / float(longterm_period)))] = df[
                    (df.timestamp >= (num_obs - longterm_period)) & (df.timestamp <= num_obs)
                ]
    else:
        all_data = [df]

    # Create empty data frames to store all anoms and
    # seasonal+trend component from decomposition
    all_anoms = DataFrame(columns=["timestamp", "count"])
    seasonal_plus_trend = DataFrame(columns=["timestamp", "count"])

    # Detect anomalies on all data (either entire data in one-pass,
    # or in 2 week blocks if longterm=TRUE)
    for i in range(len(all_data)):
        directions = {"pos": Direction(True, True), "neg": Direction(True, False), "both": Direction(False, True)}
        anomaly_direction = directions[direction]

        s_h_esd_timestamps = detect_anoms(
            all_data[i],
            k=max_anoms,
            alpha=alpha,
            num_obs_per_period=period,
            use_decomp=True,
            use_esd=False,
            one_tail=anomaly_direction.one_tail,
            upper_tail=anomaly_direction.upper_tail,
            verbose=verbose,
        )

        # store decomposed components in local variable and
        # overwrite s_h_esd_timestamps to contain only the anom timestamps
        data_decomp = s_h_esd_timestamps["stl"]
        s_h_esd_timestamps = s_h_esd_timestamps["anoms"]

        # -- Step 3: Use detected anomaly timestamps to
        # extract the actual anomalies (timestamp and value) from the data
        if s_h_esd_timestamps:
            anoms = all_data[i][all_data[i].timestamp.isin(s_h_esd_timestamps)]
        else:
            anoms = DataFrame(columns=["timestamp", "count"])

        # Filter the anomalies using one of the thresholding
        # functions if applicable
        if threshold:
            # Calculate daily max values
            if isinstance(all_data[i].index[0], np.int64):
                group = all_data[i].timestamp.map(lambda t: t / period)
            else:
                group = all_data[i].timestamp.map(Timestamp.date)
            periodic_maxes = df.groupby(group).aggregate(np.max)["count"]

            # Calculate the threshold set by the user
            if threshold == "med_max":
                thresh = periodic_maxes.median()
            elif threshold == "p95":
                thresh = periodic_maxes.quantile(0.95)
            elif threshold == "p99":
                thresh = periodic_maxes.quantile(0.99)

            # Remove any anoms below the threshold
            anoms = anoms[anoms["count"] >= thresh]

        all_anoms = all_anoms.append(anoms)
        seasonal_plus_trend = seasonal_plus_trend.append(data_decomp)

    # Cleanup potential duplicates
    all_anoms = all_anoms.drop_duplicates(subset=["timestamp"])
    seasonal_plus_trend = seasonal_plus_trend.drop_duplicates(subset=["timestamp"])

    # -- If only_last was set by the user, create subset of
    # the data that represent the most recent period
    if only_last:
        d = {"timestamp": df.timestamp.iloc[-period:], "count": df["count"].iloc[-period:]}
        x_subset_single_period = DataFrame(d, index=d["timestamp"])
        past_obs = period * 7
        if num_obs < past_obs:
            past_obs = num_obs - period
        # When plotting anoms for the last period only we only show
        # the previous 7 periods of data
        d = {"timestamp": df.iloc[:, 0].iloc[-past_obs:-period], "count": df["count"].iloc[-past_obs:-period]}
        x_subset_previous = DataFrame(d, index=d["timestamp"])
        all_anoms = all_anoms[all_anoms.timestamp >= x_subset_single_period.timestamp.iget(0)]
        num_obs = len(x_subset_single_period["count"])

    # Calculate number of anomalies as a percentage
    anom_pct = (len(df.iloc[:, 1]) / float(num_obs)) * 100

    if anom_pct == 0:
        return {"anoms": None, "plot": None}

    # The original R implementation handles plotting here.
    # Plotting is currently not implemented.
    # if plot:
    #     plot_something()

    if e_value:
        d = {
            "timestamp": all_anoms.timestamp,
            "anoms": all_anoms["count"],
            "expected_value": seasonal_plus_trend.iloc[:, 1][seasonal_plus_trend.timestamp.isin(all_anoms.timestamp)],
        }
    else:
        d = {"timestamp": all_anoms.timestamp, "anoms": all_anoms["count"]}
    anoms = DataFrame(d, index=d["timestamp"].index)

    return {"anoms": anoms, "plot": None}
Example #22
periods = pd.PeriodIndex(year=data.year, quarter=data.quarter, name="date")
data = DataFrame(
    data.to_records(),
    columns=pd.Index(["realgdp", "infl", "unemp"], name="item"),
    index=periods.to_timestamp("D", "end"),
)

ldata = data.stack().reset_index().rename(columns={0: "value"})
wdata = ldata.pivot(index="date", columns="item", values="value")

# Removing duplicates===============================
data = DataFrame({"k1": ["one"] * 3 + ["two"] * 4, "k2": [1, 1, 2, 3, 3, 4, 4]})

data.duplicated()
data.drop_duplicates()
data["v1"] = range(7)
data.drop_duplicates(["k1"])
data.drop_duplicates(["k1", "k2"], take_last=True)

# Replacing values-------------------------------
data = Series([1.0, -999.0, 2.0, -999.0, -1000.0, 3.0])
data.replace(-999, np.nan)
data.replace([-999, -1000], np.nan)
data.replace([-999, -1000], [np.nan, 0])
data.replace({-999: np.nan, -1000: 0})

# Discretization and binning
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
group_names = ["Youth", "YoungAdult", "MiddleAged", "Senior"]

cats = pd.cut(ages, bins, labels=group_names)  # labels can be omitted
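A quick check of the binning above: counts per named bin, and the interval labels produced when the labels argument is omitted:

print(pd.value_counts(cats))     # counts per named bin
print(pd.cut(ages, bins))        # interval labels when labels is omitted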
Example #23
class LocationTreeCache(object):
    """
    """

    def __init__(self):

        self.location_tree_columns = ["location_id", "parent_location_id", "lvl"]
        self.location_tree_df = DataFrame(columns=self.location_tree_columns)

    def main(self):
        """
        The loop is initiated by taking the lowest level ( village for
        instance ) and moving up throughout the tree. district > province >
        region > country etc.

        The process creates a dataframe called location_tree_df that we continue
        to append to, finally bulk inserting that information into the db.
        """

        ## iterate from the bottom of the tree to the top ##
        for lt_id, name, admin_level in (
            LocationType.objects.all().values_list("id", "name", "admin_level").order_by("-admin_level")
        ):

            self.process_location_tree_lvl(lt_id)

        self.add_lvl_zero_to_df()
        self.upsert_location_tree()

    def process_location_tree_lvl(self, location_type_id):
        """
        Get and process data for a particular location type ( admin level ).
        """

        lt_batch = []
        df_columns = ["location_id", "parent_location_id"]
        location_df = DataFrame(
            list(Location.objects.filter(location_type_id=location_type_id).values_list("id", "parent_location_id")),
            columns=df_columns,
        )
        location_df["lvl"] = 1  # since this is a direct parent child relation

        merged_df = location_df.merge(self.location_tree_df, left_on="location_id", right_on="parent_location_id")

        cleaned_merge_df = merged_df[["location_id_y", "parent_location_id_x", "lvl_y"]].copy()

        cleaned_merge_df["lvl_y"] = cleaned_merge_df["lvl_y"] + 1
        cleaned_merge_df.columns = self.location_tree_columns

        self.location_tree_df = concat([self.location_tree_df, location_df, cleaned_merge_df])
        self.location_tree_df = self.location_tree_df.drop_duplicates()  # re-assign: drop_duplicates is not in-place

    def add_lvl_zero_to_df(self):
        """
        Every location should have a row for itself in this table such that
        any location is equal to its parent when lvl = 0.
        """

        unique_location_id_list = list(self.location_tree_df["location_id"].unique())

        zero_level_df = DataFrame([[l, l, 0] for l in unique_location_id_list], columns=self.location_tree_columns)

        self.location_tree_df = self.location_tree_df.append(zero_level_df)

    def upsert_location_tree(self):
        """
        """

        lt_batch = []

        ## Drop Duplicates; NaN --> None
        self.location_tree_df.dropna(inplace=True)

        ## iterate through the location tree df created above ##
        for ix, loc in self.location_tree_df.iterrows():
            lt_batch.append(
                LocationTree(
                    **{"location_id": loc.location_id, "parent_location_id": loc.parent_location_id, "lvl": loc.lvl}
                )
            )

        LocationTree.objects.all().delete()
        LocationTree.objects.bulk_create(lt_batch)

        ## add the ultimate parent as it will not have a record in the df yet
        ultimate_parent_id = Location.objects.filter(parent_location_id__isnull=True)[0].id

        ult_parent, created = LocationTree.objects.get_or_create(
            location_id=ultimate_parent_id, parent_location_id=ultimate_parent_id, defaults={"lvl": 0}
        )
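A toy trace (plain pandas, made-up ids 3 -> 2 -> 1) of the lvl arithmetic in process_location_tree_lvl above: the lower level contributes lvl 1 rows, and the merge bumps inherited rows one level deeper:

from pandas import DataFrame, concat

cols = ["location_id", "parent_location_id", "lvl"]
tree = DataFrame([[3, 2, 1]], columns=cols)            # produced while processing the lower admin level
direct = DataFrame([[2, 1]], columns=cols[:2])
direct["lvl"] = 1                                      # direct parent/child relation

merged = direct.merge(tree, left_on="location_id", right_on="parent_location_id")
grand = merged[["location_id_y", "parent_location_id_x", "lvl_y"]].copy()
grand["lvl_y"] = grand["lvl_y"] + 1
grand.columns = cols

print(concat([tree, direct, grand]))   # includes (3, 1, 2): the grandchild sits two levels below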
Example #24
def detect_ts(
    df,
    max_anoms=0.10,
    direction="pos",
    alpha=0.05,
    only_last=None,
    threshold=None,
    e_value=False,
    longterm=False,
    piecewise_median_period_weeks=2,
    plot=False,
    y_log=False,
    xlabel="",
    ylabel="count",
    title=None,
    verbose=False,
):
    if not isinstance(df, DataFrame):
        raise ValueError("data must be a single data frame.")
    else:
        if len(df.columns) != 2 or not df.iloc[:, 1].map(np.isreal).all():
            raise ValueError(
                (
                    "data must be a 2 column data.frame, with the "
                    "first column being a set of timestamps, and "
                    "the second column being numeric values."
                )
            )

        if not (df.dtypes[0].type is np.datetime64) and not (df.dtypes[0].type is np.int64):
            df = format_timestamp(df)

    if list(df.columns.values) != ["timestamp", "count"]:
        df.columns = ["timestamp", "count"]

    # Sanity check all input parameters
    if max_anoms > 0.49:
        length = len(df.iloc[:, 1])
        raise ValueError(
            ("max_anoms must be less than 50% of " "the data points (max_anoms =%f data_points =%s).")
            % (round(max_anoms * length, 0), length)
        )

    if not direction in ["pos", "neg", "both"]:
        raise ValueError("direction options are: pos | neg | both.")

    if not (0.01 <= alpha <= 0.1):
        if verbose:
            import warnings

            warnings.warn(("alpha is the statistical significance, " "and is usually between 0.01 and 0.1"))

    if only_last and not only_last in ["day", "hr"]:
        raise ValueError("only_last must be either 'day' or 'hr'")

    if not threshold in [None, "med_max", "p95", "p99"]:
        raise ValueError("threshold options are: None | med_max | p95 | p99")

    if not isinstance(e_value, bool):
        raise ValueError("e_value must be a boolean")

    if not isinstance(longterm, bool):
        raise ValueError("longterm must be a boolean")

    if piecewise_median_period_weeks < 2:
        raise ValueError("piecewise_median_period_weeks must be at greater than 2 weeks")

    if not isinstance(plot, bool):
        raise ValueError("plot must be a boolean")

    if not isinstance(y_log, bool):
        raise ValueError("y_log must be a boolean")

    if not isinstance(xlabel, basestring):
        raise ValueError("xlabel must be a string")

    if not isinstance(ylabel, basestring):
        raise ValueError("ylabel must be a string")

    if title and not isinstance(title, basestring):
        raise ValueError("title must be a string")

    if not title:
        title = ""
    else:
        title = title + " : "

    gran = get_gran(df)

    if gran == "day":
        num_days_per_line = 7
        if isinstance(only_last, basestring) and only_last == "hr":
            only_last = "day"
    else:
        num_days_per_line = 1

    if gran == "sec":
        df.timestamp = date_format(df.timestamp, "%Y-%m-%d %H:%M:00")
        df = format_timestamp(df.groupby("timestamp").aggregate(np.sum))

    # if the data is daily, then we need to bump
    # the period to weekly to get multiple examples
    gran_period = {"min": 1440, "hr": 24, "day": 7}
    period = gran_period[gran]
    num_obs = len(df["count"])

    clamp = 1 / float(num_obs)
    if max_anoms < clamp:
        max_anoms = clamp

    if longterm:
        if gran == "day":
            num_obs_in_period = period * piecewise_median_period_weeks + 1
            num_days_in_period = 7 * piecewise_median_period_weeks + 1
        else:
            num_obs_in_period = period * 7 * piecewise_median_period_weeks
            num_days_in_period = 7 * piecewise_median_period_weeks

        last_date = df.timestamp.iget(-1)

        all_data = range(int(ceil(len(df["count"]) / float(num_obs_in_period))))

        for j in range(0, len(df.timestamp), num_obs_in_period):
            start_date = df.timestamp.iget(j)
            end_date = min(start_date + datetime.timedelta(days=num_obs_in_period), df.timestamp.iget(-1))

            # if there is at least 14 days left, subset it,
            # otherwise subset last_date - 14days
            if (end_date - start_date).days == num_days_in_period:
                all_data[int(ceil(j / num_obs_in_period))] = df[
                    (df.timestamp >= start_date) & (df.timestamp < end_date)
                ]
            else:
                all_data[int(ceil(j / num_obs_in_period))] = df[
                    (df.timestamp > (last_date - datetime.timedelta(days=num_days_in_period)))
                    & (df.timestamp <= last_date)
                ]
    else:
        all_data = [df]

    all_anoms = DataFrame(columns=["timestamp", "count"])
    seasonal_plus_trend = DataFrame(columns=["timestamp", "count"])

    # Detect anomalies on all data (either entire data in one-pass,
    # or in 2 week blocks if longterm=TRUE)
    for i in range(len(all_data)):
        directions = {"pos": Direction(True, True), "neg": Direction(True, False), "both": Direction(False, True)}
        anomaly_direction = directions[direction]

        # detect_anoms actually performs the anomaly detection and
        # returns the results in a list containing the anomalies
        # as well as the decomposed components of the time series
        # for further analysis.

        s_h_esd_timestamps = detect_anoms(
            all_data[i],
            k=max_anoms,
            alpha=alpha,
            num_obs_per_period=period,
            use_decomp=True,
            use_esd=False,
            one_tail=anomaly_direction.one_tail,
            upper_tail=anomaly_direction.upper_tail,
            verbose=verbose,
        )

        # store decomposed components in local variable and overwrite
        # s_h_esd_timestamps to contain only the anom timestamps
        data_decomp = s_h_esd_timestamps["stl"]
        s_h_esd_timestamps = s_h_esd_timestamps["anoms"]

        # -- Step 3: Use detected anomaly timestamps to extract the actual
        # anomalies (timestamp and value) from the data
        if s_h_esd_timestamps:
            anoms = all_data[i][all_data[i].timestamp.isin(s_h_esd_timestamps)]
        else:
            anoms = DataFrame(columns=["timestamp", "count"])

        # Filter the anomalies using one of the thresholding functions if applicable
        if threshold:
            # Calculate daily max values
            periodic_maxes = df.groupby(df.timestamp.map(Timestamp.date)).aggregate(np.max)["count"]

            # Calculate the threshold set by the user
            if threshold == "med_max":
                thresh = periodic_maxes.median()
            elif threshold == "p95":
                thresh = periodic_maxes.quantile(0.95)
            elif threshold == "p99":
                thresh = periodic_maxes.quantile(0.99)

            # Remove any anoms below the threshold
            anoms = anoms[anoms["count"] >= thresh]

        all_anoms = all_anoms.append(anoms)
        seasonal_plus_trend = seasonal_plus_trend.append(data_decomp)

    # Cleanup potential duplicates
    all_anoms = all_anoms.drop_duplicates(subset=["timestamp"])
    seasonal_plus_trend = seasonal_plus_trend.drop_duplicates(subset=["timestamp"])

    # -- If only_last was set by the user,
    # create subset of the data that represent the most recent day
    if only_last:
        start_date = df.timestamp.iget(-1) - datetime.timedelta(days=7)
        start_anoms = df.timestamp.iget(-1) - datetime.timedelta(days=1)
        if gran == "day":
            breaks = 3 * 12
            num_days_per_line = 7
        else:
            if only_last == "day":
                breaks = 12
            else:
                start_date = df.timestamp.iget(-1) - datetime.timedelta(days=2)
                # truncate to days
                start_date = datetime.date(start_date.year, start_date.month, start_date.day)
                start_anoms = df.timestamp.iget(-1) - datetime.timedelta(hours=1)
                breaks = 3

        # subset the last days worth of data
        x_subset_single_day = df[df.timestamp > start_anoms]
        # When plotting anoms for the last day only
        # we only show the previous weeks data
        x_subset_week = df[(df.timestamp <= start_anoms) & (df.timestamp > start_date)]
        if len(all_anoms) > 0:
            all_anoms = all_anoms[all_anoms.timestamp >= x_subset_single_day.timestamp.iget(0)]
        num_obs = len(x_subset_single_day["count"])

    # Calculate number of anomalies as a percentage
    anom_pct = (len(all_anoms.index) / float(num_obs)) * 100

    if anom_pct == 0:
        return {"anoms": None, "plot": None}

    # The original R implementation handles plotting here.
    # Plotting is currently not implemented in this version.
    # if plot:
    #     plot_something()

    if e_value:
        d = {
            "timestamp": all_anoms.timestamp,
            "anoms": all_anoms["count"],
            "expected_value": seasonal_plus_trend.iloc[:, 1][seasonal_plus_trend.timestamp.isin(all_anoms.timestamp)],
        }
    else:
        d = {"timestamp": all_anoms.timestamp, "anoms": all_anoms["count"]}
    anoms = DataFrame(d, index=d["timestamp"].index)

    return {"anoms": anoms, "plot": None}
    return sum(IP_Weight_List)


def get_consumer_weights(browser_id1, browser_id2):
    return float(data.loc[int(browser_id1), browser_id2])


# def get_consumer_list_weights(consumers_per_ip_count, browser_id_mappings ):


data2 = df.from_csv("sample.csv", header=0, index_col=None)

sample = df(data2, columns=["IP", "alt_Browser", "Consumer"])
# print sample
unique_consumers = df.drop_duplicates(df(sample.Consumer))


def recursive_add_consumers(consumer_id, seen=set([])):
    if consumer_id is None:
        return

    seen.add(consumer_id)
    consumer_key = sample[sample.Consumer == consumer_id]
    IP = df.drop_duplicates(df(consumer_key.IP))

    n = np.array(np.arange(len(IP)))

    IP_Map = set([])
    for i in n:
        value = sample[sample.IP.isin([IP.iloc[i, 0]])]
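
# A hedged sketch of what the truncated loop above appears to be doing: for one
# consumer, find every IP they used and, for each of those IPs, the set of
# consumers sharing it.  Plain pandas groupby is used instead of collecting row
# subsets; the column names and toy data below are assumptions.
import pandas as pd

sample_demo = pd.DataFrame(
    {
        "IP": ["1.1.1.1", "1.1.1.1", "2.2.2.2", "3.3.3.3"],
        "Consumer": ["a", "b", "a", "c"],
    }
)


def consumers_sharing_ips(consumer_id, frame):
    # every IP this consumer appears under
    ips = frame.loc[frame.Consumer == consumer_id, "IP"].unique()
    # for each of those IPs, the set of consumers seen on it
    grouped = frame[frame.IP.isin(ips)].groupby("IP")["Consumer"]
    return {ip: set(consumers) for ip, consumers in grouped}


print(consumers_sharing_ips("a", sample_demo))  # {'1.1.1.1': {'a', 'b'}, '2.2.2.2': {'a'}}
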
Example #26
0
from pandas import DataFrame, Series
import pandas as pd
import numpy as np

data = DataFrame({"k1": ["one"] * 3 + ["two"] * 4, "k2": [1, 1, 2, 3, 3, 4, 4]})

print(data)
print(data.duplicated())
print(data.drop_duplicates())

data["v1"] = range(7)
print(data.drop_duplicates(["k1"]))
print(data.drop_duplicates(["k1", "k2"], keep="last"))

data = DataFrame(
    {
        "food": [
            "bacon",
            "pulled pork",
            "bacon",
            "Pastrami",
            "corned beef",
            "Bacon",
            "Pastrami",
            "honey ham",
            "nova lox",
        ],
        "ounces": [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
    }
)
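
# A short, hedged continuation of the example above: normalising the 'food'
# strings before de-duplicating so that 'bacon' and 'Bacon' count as duplicates.
# It reuses the `data` frame created just above; the helper column name is an
# illustrative choice.
data["food_lower"] = data["food"].str.lower()
print(data.duplicated(subset=["food_lower"], keep=False))  # flag every row involved in a duplicate
print(data.drop_duplicates(subset=["food_lower"]))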
Example #27
0
def detect_ts(
    df,
    max_anoms=0.10,
    direction="pos",
    alpha=0.05,
    threshold=None,
    e_value=False,
    longterm=False,
    piecewise_median_period_weeks=2,
    granularity="day",
    verbose=False,
    inplace=True,
):
    """
    Anomaly Detection Using Seasonal Hybrid ESD Test
    A technique for detecting anomalies in seasonal univariate time series where the input is a
    series of <timestamp, value> pairs.

    Args:

    df: Time series as a two column data frame where the first column consists of the integer UTC Unix
    timestamps and the second column consists of the observations.

    max_anoms: Maximum number of anomalies that S-H-ESD will detect as a percentage of the
    data.

    direction: Directionality of the anomalies to be detected. Options are: ('pos' | 'neg' | 'both').

    alpha: The level of statistical significance with which to accept or reject anomalies.

    threshold: Only report positive going anoms above the threshold specified. Options are: (None | 'med_max' | 'p95' | 'p99')

    e_value: Add an additional column to the anoms output containing the expected value.

    longterm: Increase anom detection efficacy for time series that are greater than a month.
    See Details below.

    piecewise_median_period_weeks: The piecewise median time window as described in Vallis, Hochenbaum, and Kejariwal
    (2014). Defaults to 2.

    Details


    'longterm' This option should be set when the input time series is longer than a month.
    The option enables the approach described in Vallis, Hochenbaum, and Kejariwal (2014).
    'threshold' Filter all negative anomalies and those anomalies whose magnitude is smaller
    than one of the specified thresholds which include: the median
    of the daily max values (med_max), the 95th percentile of the daily max values (p95), and the
    99th percentile of the daily max values (p99).

    The returned value is a dictionary with the following components:
      anoms: Data frame containing timestamps, values, and optionally expected values.
      plot: A graphical object if plotting was requested by the user. The plot contains
      the estimated anomalies annotated on the input time series
    """

    if not isinstance(df, DataFrame):
        raise ValueError("data must be a single data frame.")
    else:
        if len(df.columns) != 2 or not df.iloc[:, 1].map(np.isreal).all():
            raise ValueError(
                """data must be a 2 column data.frame, with the first column being a set of timestamps, and
                                the second column being numeric values."""
            )

        if not (df.dtypes[0].type is np.float64) and not (df.dtypes[0].type is np.int64):
            raise ValueError(
                """The input timestamp column must be a float or integer of the unix timestamp, not date
                                time columns, date strings or pd.TimeStamp columns."""
            )

    if not inplace:
        df = copy.deepcopy(df)

    # change the column names in place, rather than copying the entire dataset, but save the headers to replace them.
    orig_header = df.columns.values
    df.rename(columns={df.columns.values[0]: "timestamp", df.columns.values[1]: "value"}, inplace=True)

    # Sanity check all input parameters
    if max_anoms > 0.49:
        length = len(df.value)
        raise ValueError(
            "max_anoms must be less than 50%% of the data points (max_anoms =%f data_points =%s)."
            % (round(max_anoms * length, 0), length)
        )

    if direction not in ["pos", "neg", "both"]:
        raise ValueError("direction options are: pos | neg | both.")

    if not (0.01 <= alpha <= 0.1):
        if verbose:
            import warnings

            warnings.warn("alpha is the statistical significance, and is usually between 0.01 and 0.1")

    if threshold not in [None, "med_max", "p95", "p99"]:
        raise ValueError("threshold options are: None | med_max | p95 | p99")

    if not isinstance(e_value, bool):
        raise ValueError("e_value must be a boolean")

    if not isinstance(longterm, bool):
        raise ValueError("longterm must be a boolean")

    if piecewise_median_period_weeks < 2:
        raise ValueError("piecewise_median_period_weeks must be at greater than 2 weeks")

    # if the data is daily, then we need to bump the period to weekly to get multiple examples
    gran = granularity
    gran_period = {"ms": 60000, "sec": 3600, "min": 1440, "hr": 24, "day": 7}
    period = gran_period.get(gran)
    if not period:
        raise ValueError("%s granularity detected. This is currently not supported." % (gran,))

    # now convert the timestamp column into a proper timestamp
    df["timestamp"] = df["timestamp"].map(lambda x: datetime.datetime.utcfromtimestamp(x))

    num_obs = len(df.value)

    clamp = 1 / float(num_obs)
    if max_anoms < clamp:
        max_anoms = clamp

    if longterm:
        if gran == "day":
            num_obs_in_period = period * piecewise_median_period_weeks + 1
            num_days_in_period = 7 * piecewise_median_period_weeks + 1
        else:
            num_obs_in_period = period * 7 * piecewise_median_period_weeks
            num_days_in_period = 7 * piecewise_median_period_weeks

        last_date = df.timestamp.iget(-1)

        all_data = []

        for j in range(0, len(df.timestamp), num_obs_in_period):
            start_date = df.timestamp.iget(j)
            end_date = min(start_date + datetime.timedelta(days=num_obs_in_period), df.timestamp.iget(-1))

            # if there is at least 14 days left, subset it, otherwise subset last_date - 14days
            if (end_date - start_date).days == num_days_in_period:
                sub_df = df[(df.timestamp >= start_date) & (df.timestamp < end_date)]
            else:
                sub_df = df[
                    (df.timestamp > (last_date - datetime.timedelta(days=num_days_in_period)))
                    & (df.timestamp <= last_date)
                ]
            all_data.append(sub_df)
    else:
        all_data = [df]

    all_anoms = DataFrame(columns=["timestamp", "value"])
    seasonal_plus_trend = DataFrame(columns=["timestamp", "value"])

    # Detect anomalies on all data (either entire data in one-pass, or in 2 week blocks if longterm=TRUE)
    for i in range(len(all_data)):
        directions = {"pos": Direction(True, True), "neg": Direction(True, False), "both": Direction(False, True)}
        anomaly_direction = directions[direction]

        # detect_anoms actually performs the anomaly detection and returns the result in a list containing the anomalies
        # as well as the decomposed components of the time series for further analysis.

        s_h_esd_timestamps = detect_anoms(
            all_data[i],
            k=max_anoms,
            alpha=alpha,
            num_obs_per_period=period,
            use_decomp=True,
            one_tail=anomaly_direction.one_tail,
            upper_tail=anomaly_direction.upper_tail,
            verbose=verbose,
        )
        if s_h_esd_timestamps is None:
            return {"anoms": DataFrame(columns=["timestamp", "anoms"])}

        # store decomposed comps in local variable and overwrite s_h_esd_timestamps to contain only the anom timestamps
        data_decomp = s_h_esd_timestamps["stl"]
        s_h_esd_timestamps = s_h_esd_timestamps["anoms"]

        # -- Step 3: Use detected anomaly timestamps to extract the actual anomalies (timestamp and value) from the data
        if s_h_esd_timestamps:
            anoms = all_data[i][all_data[i].timestamp.isin(s_h_esd_timestamps)]
        else:
            anoms = DataFrame(columns=["timestamp", "value"])

        # Filter the anomalies using one of the thresholding functions if applicable
        if threshold:
            # Calculate daily max values
            periodic_maxes = df.groupby(df.timestamp.map(Timestamp.date)).aggregate(np.max).value

            # Calculate the threshold set by the user
            thresh = 0.5
            if threshold == "med_max":
                thresh = periodic_maxes.median()
            elif threshold == "p95":
                thresh = periodic_maxes.quantile(0.95)
            elif threshold == "p99":
                thresh = periodic_maxes.quantile(0.99)

            # Remove any anoms below the threshold
            anoms = anoms[anoms.value >= thresh]

        all_anoms = all_anoms.append(anoms)
        seasonal_plus_trend = seasonal_plus_trend.append(data_decomp)

    # Cleanup potential duplicates
    try:
        all_anoms = all_anoms.drop_duplicates(subset=["timestamp"])
        seasonal_plus_trend = seasonal_plus_trend.drop_duplicates(subset=["timestamp"])
    except TypeError:
        all_anoms = all_anoms.drop_duplicates(cols=["timestamp"])
        seasonal_plus_trend = seasonal_plus_trend.drop_duplicates(cols=["timestamp"])

    # Calculate number of anomalies as a percentage
    anom_pct = (len(all_anoms.index) / float(num_obs)) * 100

    # name the columns back
    df.rename(columns={"timestamp": orig_header[0], "value": orig_header[1]}, inplace=True)

    if anom_pct == 0:
        return {"anoms": None}

    all_anoms.index = all_anoms.timestamp

    if e_value:
        d = {
            "timestamp": all_anoms.timestamp,
            "anoms": all_anoms.value,
            "expected_value": seasonal_plus_trend[seasonal_plus_trend.timestamp.isin(all_anoms.timestamp)].value,
        }
    else:
        d = {"timestamp": all_anoms.timestamp, "anoms": all_anoms.value}

    anoms = DataFrame(d, index=d["timestamp"].index)

    # convert timestamps back to unix time
    anoms["timestamp"] = anoms["timestamp"].astype(np.int64)
    anoms["timestamp"] = anoms["timestamp"].map(lambda x: x * 10e-10)

    return {"anoms": anoms}
Example #28
0
class GetHoldingsData(object):
    # def __init__(self, cik, quarter_date = None, filing_date = '2200-01-01'):
    def __init__(self, cik, quarter_date=None, filing_date=None):
        self.cik = cik
        if not quarter_date and not filing_date:
            raise TypeError("must include at least quarter_date or filing_date")
        self.quarter_date = quarter_date
        self.filing_date = filing_date

    def get_portfolio(self):
        self.form_list = self.get_forms()
        self.parse_form_list()
        # [filingdate, accession, type, quarterdate]
        self.pull_holdings()
        self.get_portfolio_weights()
        self.clean_putcall_values()
        return self.holdings

    def clean_putcall_values(self):
        self.holdings["putcall"] = self.holdings["putcall"].fillna(value="Long")

    def get_forms(self):
        sql_query, sql_parameters = self.generate_sql_query()
        conn = start_db_connection(DB_CONNECTION_TYPE)
        with closing(conn.cursor()) as cur:
            cur.execute(sql_query, sql_parameters)
            form_list = cur.fetchall()
        conn.close()
        return form_list

    def generate_sql_query(self):
        sql_query = """SELECT filingdate, accessionnunber, filingtype, quarterdate
                       FROM form13flist WHERE cik=%s"""
        parameters = [self.cik]
        if self.quarter_date:
            sql_query += " and quarterdate = %s"
            parameters.append(self.quarter_date)
        if self.filing_date:
            sql_query += " and filingdate <= %s"
            parameters.append(self.filing_date)
        return sql_query, tuple(parameters)

    def parse_form_list(self):
        if not self.quarter_date:
            no_ammendments = [x for x in self.form_list if x[2] == "13F-HR"]
            self.quarter_date = max(no_ammendments, key=itemgetter(3))[3]
            self.form_list = [x for x in self.form_list if x[3] == self.quarter_date]
        # Top sort is only important when 13F and 13F-A are on the same day
        # which happened 1503174, 2015-5-15
        self.form_list.sort(key=lambda x: x[2])
        self.form_list.sort(key=lambda x: x[0])

    def pull_holdings(self):
        sql_fields = (
            "CUSIPList.Ticker, {0}.CUSIP, {0}.nameOfIssuer,"
            "{0}.titleOfClass, {0}.value, {0}.sshPrnamt,{0}.sshPrnamtType,"
            "{0}.putCall,{0}.investmentDiscretion, {0}.Sole, {0}.Shared,"
            "{0}.None".format("form13fholdings")
        )
        sql_query = """SELECT {} from cusiplist INNER JOIN form13fholdings
                    ON cusiplist.cusip=form13fholdings.cusip WHERE
                    form13fholdings.accessionnunber = %(an)s""".format(
            sql_fields
        )

        engine = start_engine(DB_CONNECTION_TYPE)
        self.holdings = DataFrame([])
        for form in self.form_list:
            accession_nunber = form[1]
            df = pd.read_sql_query(sql_query, engine, params={"an": accession_nunber})
            if self.holdings.empty and form[2] == "13F-HR":
                self.holdings = df
            elif not self.holdings.empty and form[2] == "13F-HR/A":
                self.incorporate_ammendment(df)
            else:
                raise Exception("Not supposed to be here Holdinganalysis")

    # The logic: if the amendment reports a new position, append it; otherwise
    # the amendment's row replaces the existing entry.
    def incorporate_ammendment(self, ammendment):
        self.holdings = pd.concat([self.holdings, ammendment])
        # take_last=True keeps the amendment's values for duplicated (ticker, putcall) pairs
        self.holdings.drop_duplicates(subset=["ticker", "putcall"], take_last=True, inplace=True)

    def get_portfolio_weights(self):
        total_weight = self.holdings["value"].sum(axis=0)
        weight_func = lambda x: x / float(total_weight)
        self.holdings["weight"] = self.holdings["value"].map(weight_func)
Example #29
0
 def convertCodesDatesToDF(self, pdbsAndDates):
     df = DataFrame(pdbsAndDates, columns=["pdb", "date"])
     df["date"] = to_datetime(df["date"])
     df = df[notnull(df["pdb"])]
     df = df.drop_duplicates()
     return df
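
# subset_base is called repeatedly in the famille() function below but defined
# elsewhere in the module; judging from the quoted R reference
# (base[!base$noindiv %in% famille$noindiv, ]), it returns the rows of `base`
# whose noindiv has not yet been attributed to a family.  A hedged,
# self-contained sketch of that assumed behaviour:
from pandas import DataFrame


def subset_base_sketch(base, famille):
    """Rows of base whose noindiv does not already appear in famille."""
    return base[~base.noindiv.isin(famille.noindiv.values)]


base_demo = DataFrame({"noindiv": [101, 102, 103], "lpr": [1, 2, 4]})
famille_demo = DataFrame({"noindiv": [101], "noifam": [10100]})
print(subset_base_sketch(base_demo, famille_demo))  # rows 102 and 103 remain
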
def famille(year=2006):
    ### On suit la méthode décrite dans le Guide ERF_2002_rétropolée page 135
    #
    # TODO: appeler un fichier de paramètres de législation
    if year == 2006:
        smic = 1254
    elif year == 2007:
        smic = 1280
    elif year == 2008:
        smic = 1308
    elif year == 2009:
        smic = 1337
    else:
        print ("smic non défini")

    ## TODO check if we can remove acteu forter etc since dealt with in 01_pre_proc
    #
    # indivi <- LoadIn(indm,indVar)

    #    indivi = erf_indivi.merge(eec_indivi)
    #
    print "Etape 1 : préparation de base"
    print "    1.1 : récupération de indivi"
    indivi = load_temp(name="indivim", year=year)

    indivi["year"] = year
    indivi["noidec"] = indivi["declar1"].apply(lambda x: str(x)[0:2])
    indivi["agepf"] = where(indivi["naim"] < 7, indivi["year"] - indivi["naia"], indivi["year"] - indivi["naia"] - 1)

    indivi = indivi[~((indivi["lien"] == 6) & (indivi["agepf"] < 16) & (indivi["quelfic"] == "EE"))]

    print "    1.2 : récupération des enfants à naître"
    indVar = [
        "noi",
        "noicon",
        "noindiv",
        "noiper",
        "noimer",
        "ident",
        "declar1",
        "naia",
        "naim",
        "lien",
        "quelfic",
        "acteu",
        "stc",
        "contra",
        "titc",
        "mrec",
        "forter",
        "rstg",
        "retrai",
        "lpr",
        "cohab",
        "ztsai",
        "sexe",
        "persfip",
        "agepr",
        "rga",
        "actrec",
        "agepf",
        "noidec",
        "year",
    ]

    enfnn = load_temp(name="enfnn", year=year)
    enfnn = enfnn[indVar]  # NOTE: la moitié des colonnes est remplie avec des NaN

    enfnn = enfnn.drop_duplicates("noindiv")
    print "nb enfants à naitre", len(enfnn.index)
    print "On enlève les enfants à naitre qui ne sont pas les enfants de la personne de référence"
    enfnn = enfnn[enfnn["lpr"] == 3]
    enfnn = enfnn[~(enfnn.noindiv.isin(indivi.noindiv.values))]
    print len(enfnn.index)

    # # # PB with vars "agepf"  "noidec" "year"  NOTE: quels problèmes ? JS
    # # base <- rbind(indivi,enfnn)
    # # setdiff(names(indivi),names(enfnn))
    # #

    print "    1.3 : création de base"
    base = concat([indivi, enfnn])
    print "length of base", len(base.index)

    base["noindiv"] = 100 * base["ident"] + base["noi"]
    base["m15"] = base["agepf"] < 16
    base["p16m20"] = (base["agepf"] >= 16) & (base["agepf"] <= 20)
    base["p21"] = base["agepf"] >= 21
    base["ztsai"] = where(base["ztsai"] is None, 0, base["ztsai"])
    base["smic55"] = base["ztsai"] >= smic * 12 * 0.55  ##55% du smic mensuel brut
    base["famille"] = 0
    base["kid"] = False
    print base.smic55.describe()

    def control_04(dataframe):
        print "longueur de la dataframe après opération =", len(dataframe.index)
        dup = dataframe.duplicated(cols="noindiv")
        print "contrôle des doublons =>", any(dup == True)  # dup.describe()
        print "contrôle des colonnes ->", len(dataframe.columns)
        print "nombre de familles différentes", len(set(famille.noifam.values))
        print "contrôle noifam is null:", len(dataframe[dataframe["noifam"].isnull()])
        if len(dataframe.index) > len(base.index):
            raise Exception("too many rows compared to base")

    # # message('Etape 1: On cherche les enfants ayant père et/ou mère')
    # # pr <- subset(base,lpr==1,c('ident','noi'))
    # # pr$noifam <- 100*pr$ident + pr$noi
    # # pr <- pr[c('ident','noifam')]
    # #
    # # nof01 <- subset(base,(lpr %in% c(1,2) )|(lpr==3 & m15) | (lpr==3 & (p16m20 & !smic55) ))
    # # nof01 <- merge(pr,nof01,by ='ident')
    # # nof01 <- within(nof01,{
    # #   famille <- 10
    # #   kid <-(lpr==3 & m15) | (lpr==3 & (p16m20 & !smic55 ) )
    # #   })
    # # famille <- nof01

    print ""
    print "Etape 2 : On cherche les enfants ayant père et/ou mère"

    pr = base[base["lpr"] == 1].loc[:, ["ident", "noi"]]
    pr["noifam"] = 100 * pr["ident"] + pr["noi"]
    pr = pr.loc[:, ["ident", "noifam"]]
    print "length pr", len(pr.index)

    nof01 = base[
        (base.lpr.isin([1, 2]))
        | ((base["lpr"] == 3) & (base["m15"]))
        | ((base["lpr"] == 3) & (base["p16m20"]) & (~base["smic55"]))
    ]
    print "longueur de nof01 avant merge", len(nof01.index)
    nof01 = nof01.merge(pr, on="ident", how="outer")
    nof01["famille"] = 10
    nof01["kid"] = ((nof01["lpr"] == 3) & (nof01["m15"])) | (
        (nof01["lpr"] == 3) & (nof01["p16m20"]) & ~(nof01["smic55"])
    )
    famille = nof01
    print nof01["kid"].value_counts()
    print nof01.lpr.value_counts()

    del nof01
    control_04(famille)

    print "    2.1 : identification des couples"
    # l'ID est le noi de l'homme
    hcouple = subset_base(base, famille)
    hcouple = hcouple[(hcouple["cohab"] == 1) & (hcouple["lpr"] >= 3) & (hcouple["sexe"] == 1)]
    hcouple["noifam"] = 100 * hcouple["ident"] + hcouple["noi"]
    hcouple["famille"] = 21
    print "longueur hcouple", len(hcouple.index)

    # # message('Etape 2b')
    # # fcouple<- base[!base$noindiv %in% famille$noindiv,]
    # # fcouple <- subset(fcouple,(cohab==1) & (lpr>=3) & (sexe==2))
    # # fcouple <- within(fcouple,{
    # #     noifam <- 100*ident + noicon ## l'identifiant est le conjoint du ménage  */
    # #     famille <- 22 })
    # #
    # # famcom<- merge(fcouple['noifam'],hcouple['noifam'])
    # # fcouple <- merge(famcom,fcouple)
    # #
    # # famille <- rbind(famille,hcouple,fcouple)

    print "    2.2 : attributing the noifam to the wives"
    fcouple = base[~(base.noindiv.isin(famille.noindiv.values))]
    fcouple = fcouple[(fcouple["cohab"] == 1) & (fcouple["lpr"] >= 3) & (fcouple["sexe"] == 2)]
    fcouple["noifam"] = 100 * fcouple["ident"] + fcouple["noi"]
    fcouple["famille"] = 22
    print "longueur fcouple", len(fcouple.index)

    famcom = fcouple.merge(hcouple, on="noifam", how="outer")
    print "longueur fancom après fusion", len(famcom.index)
    fcouple = fcouple.merge(famcom)  # NOTE : faire un inner merge sinon présence de doublons
    print "longueur fcouple après fusion", len(fcouple.index)

    famille = concat([famille, hcouple, fcouple], join="inner")
    control_04(famille)

    print ""
    print "Etape 3: Récupération des personnes seules"
    print "    3.1 : personnes seules de catégorie 1"
    seul1 = base[~(base.noindiv.isin(famille.noindiv.values))]
    seul1 = seul1[
        (seul1.lpr.isin([3, 4]))
        & ((seul1["p16m20"] & seul1["smic55"]) | seul1["p21"])
        & (seul1["cohab"] == 1)
        & (seul1["sexe"] == 2)
    ]
    if len(seul1.index) > 0:
        seul1["noifam"] = 100 * seul1["ident"] + seul1["noi"]
        seul1["famille"] = 31
        famille = concat([famille, seul1])
    print len(seul1.index)
    control_04(famille)

    # # message('  3.2 personnes seules 2')
    # # seul2 <- base[(!base$noindiv %in% famille$noindiv),]
    # # seul2 <- subset(seul2,(lpr %in% c(3,4)) & p16m20 & smic55 & (cohab!=1))
    # # seul2 <- within(seul2,{noifam <- 100*ident+noi
    # #                      famille <- 32})
    # # famille <- rbind(famille,seul2)
    print "    3.1 personnes seules de catégorie 2"
    seul2 = base[~(base.noindiv.isin(famille.noindiv.values))]
    seul2 = seul2[(seul2.lpr.isin([3, 4])) & seul2["p16m20"] & seul2["smic55"] & (seul2["cohab"] != 1)]
    seul2["noifam"] = 100 * seul2["ident"] + seul2["noi"]
    seul2["famille"] = 32
    famille = concat([famille, seul2])
    control_04(famille)

    # # message(' 3.3 personnes seules 3')
    # # seul3 <- base[(!base$noindiv %in% famille$noindiv),]
    # # seul3 <- subset(seul3,(lpr %in% c(3,4)) & p21 & cohab!=1)
    # #     ## TODO CHECK erreur dans le guide méthodologique ERF 2002 lpr 3,4 au lieu de 3 seulement */
    # # seul3 <- within(seul3,{noifam=100*ident+noi
    # #                          famille = 33})
    # # famille <- rbind(famille,seul3)

    print "    3.3 personnes seules de catégorie 3"
    seul3 = subset_base(base, famille)
    seul3 = seul3[(seul3.lpr.isin([3, 4])) & seul3["p21"] & (seul3["cohab"] != 1)]
    seul3["noifam"] = 100 * seul3["ident"] + seul3["noi"]
    seul3["famille"] = 33
    famille = concat([famille, seul3])
    control_04(famille)

    # # message(' 3.4 personnes seules 4')
    # # seul4 <- base[(!base$noindiv %in% famille$noindiv),]
    # # seul4 <- subset(seul4,(lpr==4) & p16m20 & !smic55 & noimer==0 & noiper==0 & persfip=="vous")
    # #
    # # if (nrow(seul4) >0 ) {  # 2006, 2009 pas de personne seule (sans enfant fip)
    # #   seul4 <- within(seul4,{noifam = 100*ident + noi
    # #                          famille = 34})
    # # }
    # #
    # # famille <- rbind(famille,seul4)

    print "    3.4 : personnes seules de catégorie 4"
    seul4 = subset_base(base, famille)
    seul4 = seul4[
        (seul4["lpr"] == 4)
        & seul4["p16m20"]
        & ~(seul4["smic55"])
        & (seul4["noimer"] == 0)
        & (seul4["persfip"] == "vous")
    ]

    if len(seul4.index) > 0:
        seul4["noifam"] = 100 * seul4["ident"] + seul4["noi"]
        seul4["famille"] = 34
        famille = concat([famille, seul4])
    control_04(famille)

    # # message('Etape 4')
    # # message(' 4.1 enfant avec mère')
    # # avec_mere <- base[(!base$noindiv %in% famille$noindiv),]
    # # avec_mere <- subset(avec_mere,((lpr=4) & ( (p16m20=1) | (m15=1))) & noimer!=0)
    # #
    # # avec_mere <- within(avec_mere,{noifam=100*ident + noimer
    # #              famille=41
    # #              kid=TRUE})
    # #
    # # ## on récupère les mères */
    # # mereid <- upData(avec_mere['noifam'], rename = c(noifam = 'noindiv'));
    # # mereid <- unique(mereid)
    # #
    # # mere <- merge(mereid,base)
    # # mere <- within(mere,{noifam=100*ident + noi
    # #                      famille=42})
    # # # TODO il y a deux mères qui ne sont pas dans les individus (problème des conjoints fip ? MBJ ne comprends pas) :
    # # dim(mereid)
    # # dim(mere)
    # # # TODO on préfère  donc enlever leurs enfants
    # # avec_mere <- avec_mere[avec_mere$noifam %in% mere$noifam,]
    # #
    print ""
    print "Etape 4 : traitement des enfants"
    print "    4.1 : enfant avec mère"
    avec_mere = subset_base(base, famille)
    avec_mere = avec_mere[
        ((avec_mere["lpr"] == 4) & ((avec_mere["p16m20"] == 1) | (avec_mere["m15"] == 1)) & (avec_mere["noimer"] != 0))
    ]
    avec_mere["noifam"] = 100 * avec_mere["ident"] + avec_mere["noimer"]
    avec_mere["famille"] = 41
    avec_mere["kid"] = True

    # On récupère les mères des enfants
    mereid = DataFrame(avec_mere["noifam"])
    mereid.columns = ["noindiv"]
    mereid = mereid.drop_duplicates()

    mere = mereid.merge(base)
    mere["noifam"] = 100 * mere["ident"] + mere["noi"]
    mere["famille"] = 42  # H2G2 nous voilà
    avec_mere = avec_mere[avec_mere.noifam.isin(mere.noifam.values)]
    print "contrôle df mère"
    control_04(mere)

    # # conj_mere <- merge(conj_mereid,base)
    # # conj_mere$famille <- 43
    # #
    # # famille <- famille[(!famille$noindiv %in% mere$noindiv),]
    # #
    # # ## on récupère les conjoints des mères */
    # # conj_mereid <- mere[mere$noicon!=0,c('ident','noicon','noifam')]
    # #
    # # conj_mereid$noindiv = 100*conj_mereid$ident + conj_mereid$noicon
    # # conj_mereid <- conj_mereid[c('noindiv','noifam')]
    # #
    # # conj_mere <- merge(conj_mereid,base)
    # # conj_mere$famille <- 43
    # #
    # # famille <- famille[(!famille$noindiv %in% conj_mere$noindiv),]
    # # famille <- rbind(famille,avec_mere,mere,conj_mere)
    # #

    famille = famille[~(famille.noindiv.isin(mere.noindiv.values))]
    control_04(famille)

    # on retrouve les conjoints des mères
    conj_mereid = mere[mere["noicon"] != 0].loc[:, ["ident", "noicon", "noifam"]]
    conj_mereid["noindiv"] = 100 * conj_mereid["ident"] + conj_mereid["noicon"]
    conj_mereid = conj_mereid.loc[:, ["noindiv", "noifam"]]
    conj_mereid = conj_mereid.merge(base)
    control_04(conj_mereid)

    conj_mere = conj_mereid.merge(base)
    conj_mere["famille"] = 43

    famille = famille[~(famille.noindiv.isin(conj_mere.noindiv.values))]
    famille = concat([famille, avec_mere, mere, conj_mere])
    control_04(famille)
    del avec_mere, mere, conj_mere, mereid, conj_mereid

    # # message(' 4.2 enfants avec père')
    # # avec_pere <- base[(!base$noindiv %in% famille$noindiv),]
    # # avec_pere <- subset(avec_pere,((lpr=4) & ( (p16m20=1) | (m15=1))) & noiper!=0)
    # # avec_pere <- within(avec_pere,{noifam=100*ident + noiper
    # #              famille=44
    # #              kid=TRUE})
    # #
    # # ## on récupère les pères  pour leur attribuer une famille propre */
    # # pereid <- upData(avec_pere['noifam'], rename = c(noifam = 'noindiv'));
    # # pereid <- unique(pereid)
    # # pere <- merge(pereid,base)
    # # pere <- within(pere,{noifam=100*ident + noi
    # #                        famille=45})
    # #
    # # famille <- famille[(!famille$noindiv %in% pere$noindiv),]
    # #
    # # ## on récupère les conjoints des pères */
    # # conj_pereid <- pere[pere$noicon!=0,c('ident','noicon','noifam')]
    # # conj_pereid$noindiv = 100*conj_pereid$ident + conj_pereid$noicon
    # # conj_pereid <- conj_pereid[c('noindiv','noifam')]
    # #
    # # conj_pere <- merge(conj_pereid,base)
    # # if (nrow(conj_pere) >0) conj_pere$famille <- 46
    # # # 2006: erreur pas de conjoint de père ?
    # #
    # # famille <- famille[(!famille$noindiv %in% conj_pere$noindiv),]
    # # famille <- rbind(famille,avec_pere,pere,conj_pere)

    print "    4.2 : enfants avec père"
    avec_pere = subset_base(base, famille)
    avec_pere = avec_pere[
        (avec_pere["lpr"] == 4)
        & ((avec_pere["p16m20"] == 1) | (avec_pere["m15"] == 1))
        & (avec_pere["noiper"].notnull())
    ]
    avec_pere["noifam"] = 100 * avec_pere["ident"] + avec_pere["noiper"]
    avec_pere["famille"] = 44
    avec_pere["kid"] = True
    print "presence of NaN in avec_pere ?", avec_pere["noifam"].isnull().any()

    pereid = DataFrame(avec_pere["noifam"])
    pereid.columns = ["noindiv"]
    pereid = pereid.drop_duplicates()
    pere = base.merge(pereid, on="noindiv", how="inner")

    pere["noifam"] = 100 * pere["ident"] + pere["noi"]
    pere["famille"] = 45
    famille = famille[~(famille.noindiv.isin(pere.noindiv.values))]

    # On récupère les conjoints des pères
    conj_pereid = pere.loc[array(pere["noicon"] != 0), ["ident", "noicon", "noifam"]]
    conj_pereid["noindiv"] = 100 * conj_pereid["ident"] + conj_pereid["noicon"]
    conj_pereid = conj_pereid.loc[:, ["noindiv", "noifam"]]

    conj_pere = base.merge(conj_pereid, on=["noindiv"], how="inner")
    control_04(conj_pere)
    if len(conj_pere.index) > 0:
        conj_pere["famille"] = 46

    famille = famille[~(famille.noindiv.isin(conj_pere.noindiv.values))]
    famille = concat([famille, avec_pere, pere, conj_pere])
    print "contrôle de famille après ajout des pères"
    control_04(famille)
    del avec_pere, pere, pereid, conj_pere, conj_pereid

    # # ##* 42. enfants avec déclarant */
    # # avec_dec <- base[(!base$noindiv %in% famille$noindiv),]
    # # avec_dec <- subset(avec_dec,(persfip=="pac") & (lpr=4) &  ( (p16m20&!smic55) | (m15=1 )))
    # # avec_dec <- within(avec_dec,{noifam = 100*ident + noidec
    # #             famille=47
    # #             kid=TRUE})
    # #
    # # ## on récupère les déclarants pour leur attribuer une famille propre */
    # # decid <- upData(avec_dec['noifam'], rename = c(noifam = 'noindiv'));
    # # decid <- unique(decid)
    # #
    # # dec <- merge(decid,base)
    # # dec <- within(dec,{noifam=100*ident + noi
    # #                    famille=48})
    # #
    # # famille <- famille[(!famille$noindiv %in% dec$noindiv),]
    # # famille <- rbind(famille,avec_dec,dec)

    print "    4.3 : enfants avec déclarant"
    avec_dec = subset_base(base, famille)
    avec_dec = avec_dec[
        (avec_dec["persfip"] == "pac")
        & (avec_dec["lpr"] == 4)
        & ((avec_dec["p16m20"] & ~(avec_dec["smic55"])) | (avec_dec["m15"] == 1))
    ]
    avec_dec["noifam"] = 100 * avec_dec["ident"] + avec_dec["noidec"].astype("float")
    avec_dec["famille"] = 47
    avec_dec["kid"] = True
    control_04(avec_dec)

    # on récupère les déclarants pour leur attribuer une famille propre
    decid = DataFrame(avec_dec["noifam"])
    decid.columns = ["noindiv"]
    decid = decid.drop_duplicates()
    dec = base.merge(decid, how="inner")
    dec["noifam"] = 100 * dec["ident"] + dec["noi"]
    dec["famille"] = 48

    famille = famille[~(famille.noindiv.isin(dec.noindiv.values))]
    famille = concat([famille, avec_dec, dec])
    del dec, decid, avec_dec
    control_04(famille)

    # # ## famille etape 5 : enfants fip */
    # # message('Etape 5 : enfants fip')
    # # # On rajoute les enfants fip
    # # # (on le fait ici pour que cela n'interfère pas avec les recherches précédentes)
    # # fip <- LoadIn(fipDat)
    # #
    # # indVar = c('noi','noicon','noindiv','noiper','noimer','ident','declar1','naia','naim','lien','quelfic','acteu','stc','contra','titc','mrec',
    # #             'forter','rstg','retrai','lpr','cohab','ztsai','sexe','persfip','agepr','rga')
    # #
    # # fip <- fip[c(indVar,'actrec','agepf','noidec','year')]
    # #
    # # table(duplicated(fip$noindiv))
    # #
    # # ## Variables auxilaires présentes dans base qu'il faut rajouter aux fip'
    # # ## WARNING les noindiv des fip sont construits sur les ident des déclarants
    # # ## pas d'orvelap possible avec les autres noindiv car on a des noi =99, 98, 97 ,...'
    # # names(fip)
    # #
    # # fip <- within(fip,{
    # #   m15 <- (agepf<16)
    # #   p16m20 <- ((agepf>=16) & (agepf<=20))
    # #   p21 <- (agepf>=21)
    # #   ztsai[is.na(ztsai)] <- 0
    # #   smic55 <- (ztsai >= smic*12*0.55)   ## 55% du smic mensuel brut */
    # #   famille <- 0
    # #   kid <- FALSE
    # # })

    print ""
    print "Etape 5 : récupération des enfants fip-----------"
    print "    5.1 : création de la df fip"
    fip = load_temp(name="fipDat", year=year)
    indVar_fip = [
        "noi",
        "noicon",
        "noindiv",
        "noiper",
        "noimer",
        "ident",
        "declar1",
        "naia",
        "naim",
        "lien",
        "quelfic",
        "acteu",
        "stc",
        "contra",
        "titc",
        "mrec",
        "forter",
        "rstg",
        "retrai",
        "lpr",
        "cohab",
        "ztsai",
        "sexe",
        "persfip",
        "agepr",
        "rga",
        "actrec",
        "agepf",
        "noidec",
        "year",
    ]
    fip = fip.loc[:, indVar_fip]

    # Variables auxiliaires présentes dans base qu'il faut rajouter aux fip
    # WARNING les noindiv des fip sont construits sur les ident des déclarants
    # pas d'overlap possible avec les autres noindiv car on a des noi = 99, 98, 97, ...
    fip["m15"] = fip["agepf"] < 16
    fip["p16m20"] = (fip["agepf"] >= 16) & (fip["agepf"] <= 20)
    fip["p21"] = fip["agepf"] >= 21
    #     fip['ztsai'][fip['ztsai'] is None] = 0 #there are alrdy zeros
    fip["smic55"] = fip["ztsai"] >= smic * 12 * 0.55
    fip["famille"] = 0
    fip["kid"] = False
    print fip["ztsai"].isnull().describe()

    # # base <- rbind(base,fip)
    # # table(base$quelfic)

    # # enfant_fip <- base[(!base$noindiv %in% famille$noindiv),]
    # # enfant_fip <- subset(enfant_fip, (quelfic=="FIP") & (( (agepf %in% c(19,20)) & !smic55 ) | (naia==year & rga=='6')) )  # TODO check year ou year-1 !
    # # enfant_fip <- within(enfant_fip,{
    # #                      noifam=100*ident+noidec
    # #                      famille=50
    # #                      kid=TRUE})
    # # #                     ident=NA}) # TODO : je ne sais pas quoi mettre un NA fausse les manips suivantes
    # # famille <- rbind(famille,enfant_fip)
    # #
    # # # TODO: En 2006 on peut faire ce qui suit car tous les parents fip sont déjà dans une famille
    # # parent_fip <- famille[famille$noindiv %in% enfant_fip$noifam,]
    # # any(enfant_fip$noifam %in% parent_fip$noindiv)
    # # parent_fip <- within(parent_fip,{
    # #                      noifam <- noindiv
    # #                      famille <- 51
    # #                      kid <- FALSE})
    # # famille[famille$noindiv %in% enfant_fip$noifam,] <- parent_fip
    # # # TODO quid du conjoint ?

    print "    5.2 : extension de base avec les fip"
    print fip[["noindiv", "noidec", "ztsai"]].describe()
    base_ = concat([base, fip])
    print len(base.index)

    enfant_fip = subset_base(base_, famille)
    print enfant_fip.ix[enfant_fip["quelfic"] == "FIP", "agepf"].describe()

    enfant_fip = enfant_fip[
        (enfant_fip["quelfic"] == "FIP")
        & (
            (enfant_fip.agepf.isin([19, 20]) & ~(enfant_fip["smic55"]))
            | ((enfant_fip["naia"] == enfant_fip["year"] - 1) & (enfant_fip["rga"].astype("int") == 6))
        )
    ]

    enfant_fip["noifam"] = 100 * enfant_fip["ident"] + enfant_fip["noidec"]
    enfant_fip["famille"] = 50
    enfant_fip["kid"] = True
    enfant_fip["ident"] = None
    control_04(enfant_fip)

    famille = concat([famille, enfant_fip])
    base = concat([base, enfant_fip])
    parent_fip = famille[famille.noindiv.isin(enfant_fip.noifam.values)]

    if any(enfant_fip.noifam.isin(parent_fip.noindiv.values)):
        print "Doublons entre enfant_fip et parent fip !"
    parent_fip["noifam"] = parent_fip["noindiv"]
    parent_fip["famille"] = 51
    parent_fip["kid"] = False
    print "contrôle de parent_fip"
    control_04(parent_fip)

    print "famille defore merge and clearing"
    control_04(famille)

    famille = famille.merge(parent_fip, how="outer")
    famille["famille"] = famille["famille"].astype("int")
    famille = famille.drop_duplicates(cols="noindiv", take_last=True)

    print "famille after merge and clearing"
    print set(famille.famille.values)
    control_04(famille)
    print famille.loc[famille.noindiv.isin(enfant_fip.noifam), "famille"].describe()
    del enfant_fip, fip, parent_fip

    # # message('Etape 6 : non attribué')
    # # non_attribue1 <- base[(!base$noindiv %in% famille$noindiv),]
    # # non_attribue1 <- subset(non_attribue1,
    # #                         (quelfic!="FIP") & (m15 | (p16m20&(lien %in% c(1,2,3,4) & agepr>=35)))
    # #                         )
    # # # On rattache les moins de 15 ans avec la PR (on a déjà éliminé les enfants en nourrice)
    # # non_attribue1 <- merge(pr,non_attribue1)
    # # non_attribue1 <- within(non_attribue1,{
    # #   famille <- ifelse(m15,61,62)
    # #     kid <- TRUE })
    # #
    # # rm(pr)
    # # famille <- rbind(famille,non_attribue1)
    # # dup <- duplicated(famille$noindiv)
    # # table(dup)
    # # rm(non_attribue1)
    # # table(famille$famille, useNA="ifany")
    # #
    # # non_attribue2 <- base[(!base$noindiv %in% famille$noindiv) & (base$quelfic!="FIP"),]
    # # non_attribue2 <- within(non_attribue2,{
    # #   noifam <- 100*ident+noi # l'identifiant est celui du jeune */
    # #     kid<-FALSE
    # #     famille<-63})
    # #
    # # famille <- rbind(famille,non_attribue2)
    print ""
    print "Etape 6 : gestion des non attribués"
    print "    6.1 : non attribués type 1"
    non_attribue1 = subset_base(base, famille)
    non_attribue1 = non_attribue1[
        ~(non_attribue1["quelfic"] != "FIP")
        & (
            non_attribue1["m15"]
            | (non_attribue1["p16m20"] & (non_attribue1.lien.isin(range(1, 5))) & (non_attribue1["agepr"] >= 35))
        )
    ]
    # On rattache les moins de 15 ans avec la PR (on a déjà éliminé les enfants en nourrice)
    non_attribue1 = pr.merge(non_attribue1)
    control_04(non_attribue1)
    non_attribue1["famille"] = where(non_attribue1["m15"], 61, 62)
    non_attribue1["kid"] = True

    famille = concat([famille, non_attribue1])
    control_04(famille)
    del pr, non_attribue1

    print "    6.2 : non attribué type 2"
    non_attribue2 = base[(~(base.noindiv.isin(famille.noindiv.values)) & (base["quelfic"] != "FIP"))]
    non_attribue2["noifam"] = 100 * non_attribue2["ident"] + non_attribue2["noi"]
    non_attribue2["noifam"] = non_attribue2["noifam"].astype("int")
    non_attribue2["kid"] = False
    non_attribue2["famille"] = 63

    famille = concat([famille, non_attribue2], join="inner")
    control_04(famille)
    del non_attribue2

    # # ## Sauvegarde de la table famille */
    # #
    # # # TODO nettoyer les champs qui ne servent plus à rien
    # #
    print ""
    print "Etape 7 : Sauvegarde de la table famille"
    print "    7.1 : Mise en forme finale"
    famille["idec"] = famille["declar1"].str[3:11]
    print famille["declar1"].notnull().describe()
    famille["idec"].apply(lambda x: str(x) + "-")
    famille["idec"] += famille["declar1"].str[0:2]
    famille["chef"] = famille["noifam"] == famille["ident"] * 100 + famille["noi"]

    famille.reset_index(inplace=True)
    print famille["idec"].isnull().describe()

    control_04(famille)

    print "    7.2 : création de la colonne rang"
    famille["rang"] = famille["kid"].astype("int")

    while any(famille[(famille["rang"] != 0)].duplicated(cols=["rang", "noifam"])):
        famille["rang"][famille["rang"] != 0] = where(
            famille[famille["rang"] != 0].duplicated(cols=["rang", "noifam"]),
            famille["rang"][famille["rang"] != 0] + 1,
            famille["rang"][famille["rang"] != 0],
        )
        print "nb de rangs différents", len(set(famille.rang.values))

    print "    7.3 : création de la colonne quifam et troncature"

    print "value_counts chef", famille["chef"].value_counts()
    print "value_counts kid", famille["kid"].value_counts()

    famille["quifam"] = -1
    print "controle initial", len(famille[famille["quifam"] == -1])
    famille["quifam"] = where(famille["chef"], 0, famille["quifam"])
    famille["quifam"] = where(famille["kid"], 1 + famille["rang"], famille["quifam"])
    famille["quifam"] = where(~(famille["chef"]) & ~(famille["kid"]), 1, famille["quifam"])

    famille["noifam"] = famille["noifam"].astype("int")
    print famille["quifam"].value_counts()

    famille_check = famille
    famille = famille.loc[:, ["noindiv", "quifam", "noifam"]]
    famille.columns = ["noindiv", "quifam", "idfam"]

    print "Vérifications sur famille"
    assert len(famille_check.loc[famille_check["chef"], :]) == len(
        set(famille.idfam.values)
    ), "the number of family chiefs is different from the number of families"
    assert not (any(famille.duplicated(cols=["idfam", "quifam"]))), "there are duplicates of quifam inside a family"
    assert famille["quifam"].notnull().all(), "there are missing values in quifam"
    assert famille["idfam"].notnull().all(), "there are missing values in idfam"
    #    control(famille, debug=True, verbose=True, verbose_columns=['idfam', 'quifam'])

    print "    Sauvegarde de famille"
    save_temp(famille, name="famc", year=year)
    del famille_check, indivi, enfnn