def close_session(session, first_timestamp_ts):
    size = len(session)

    # Create an artificial session id based on the first click timestamp and a hash of the user id
    first_click = session[0]
    session_id = (int(first_click['timestamp']) * 100) + hash_str_to_int(
        first_click['user_id'].encode(), 3)
    # Convert the timestamp offset to hours since the first timestamp
    session_hour = int(
        (first_click['timestamp'] - first_timestamp_ts) / (1000 * 60 * 60))

    # Convert the clicks to Spark Rows so the RDD can be turned back into a DataFrame
    # TODO add 'view' here
    clicks = [T.Row(**click) for click in session]
    session_dict = {
        'session_id': session_id,
        'session_hour': session_hour,
        'session_size': size,
        'session_start': first_click['timestamp'],
        'user_id': first_click['user_id'],
        'clicks': clicks
    }
    session_row = T.Row(**session_dict)

    return session_row
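A minimal usage sketch (hypothetical click data and a toy hash_str_to_int stand-in; assumes close_session above is in scope and T is pyspark.sql.types): group clicks per user, close each session, and rebuild a DataFrame from the returned Rows.
import hashlib
from pyspark.sql import SparkSession

def hash_str_to_int(data, digits):
    # toy stand-in for the real helper: deterministic int hash truncated to `digits` decimal digits
    return int(hashlib.md5(data).hexdigest(), 16) % (10 ** digits)

spark = SparkSession.builder.master('local[1]').getOrCreate()

clicks = [
    {'timestamp': 1500000000000, 'user_id': 'u1', 'item_id': 'a'},
    {'timestamp': 1500000060000, 'user_id': 'u1', 'item_id': 'b'},
]
first_timestamp_ts = 1500000000000

sessions_rdd = (spark.sparkContext
                .parallelize(clicks)
                .groupBy(lambda c: c['user_id'])
                .map(lambda kv: close_session(
                    sorted(kv[1], key=lambda c: c['timestamp']), first_timestamp_ts)))
sessions_df = spark.createDataFrame(sessions_rdd)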
    def test_as_list_shallow(self):
        input = [pst.Row(id=1, a=pst.Row(b=101))]
        input_df = self.spark.createDataFrame(input)

        actual = as_list(input_df, False)
        expect = [{'id': 1, 'a': pst.Row(b=101)}]

        self.assertEqual(actual, expect)
    def test_as_list_deep(self):
        input = [pst.Row(id=1, a=pst.Row(b=101))]
        input_df = self.spark.createDataFrame(input)

        actual = as_list(input_df, True)
        expect = [{'id': 1, 'a': {'b': 101}}]

        self.assertEqual(actual, expect)
Example #4
def test_as_list_shallow(spark):
    """as_list converts rows to dictionary without deep"""
    input = [pst.Row(id=1, a=pst.Row(b=101))]
    input_df = spark.createDataFrame(input)

    actual = to_list(input_df, False)
    expect = [{'id': 1, 'a': pst.Row(b=101)}]

    assert actual == expect
Example #5
def test_as_list_deep(spark):
    """as_list converts rows to dictionary deeply"""
    input = [pst.Row(id=1, a=pst.Row(b=101))]
    input_df = spark.createDataFrame(input)

    actual = to_list(input_df, True)
    expect = [{'id': 1, 'a': {'b': 101}}]

    assert actual == expect
def test_as_list_deep_convert_fields(spark):
    """Given a dataframe with nested structure fields, 
       When as_list with shallow=True is called,
       Then result is a list of dictionaries with structure field not converted"""
    input = [(1, pst.Row(first_name='John'))]
    input_df = spark.createDataFrame(input, ['id', 'person'])

    actual = as_list(input_df, False)
    expect = [dict(id=1, person=pst.Row(first_name='John'))]

    assert actual == expect
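The as_list helper under test is not included in this listing; a minimal implementation consistent with these tests (a sketch, not the project's actual code) could be:
def as_list(df, deep=False):
    # Row.asDict(recursive=True) also converts nested Rows to plain dicts;
    # with recursive=False (the shallow case) nested Rows are left as Rows.
    return [row.asDict(recursive=deep) for row in df.collect()]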
    def _create_train_image_uris_and_labels(self,
                                            repeat_factor=1,
                                            cardinality=100,
                                            dense=True):
        image_uris = getSampleImagePaths() * repeat_factor
        # Create image categorical labels (integer IDs)
        local_rows = []
        for uri in image_uris:
            label = np.random.randint(low=0, high=cardinality, size=1)[0]
            if dense:
                label_inds = np.zeros(cardinality)
                label_inds[label] = 1.0
                label_inds = label_inds.ravel()
                assert label_inds.shape[0] == cardinality, label_inds.shape
                one_hot_vec = spla.Vectors.dense(label_inds.tolist())
            else:  # sparse
                one_hot_vec = spla.Vectors.sparse(cardinality, {label: 1})
            _row_struct = {
                self.input_col: uri,
                self.one_hot_col: one_hot_vec,
                self.one_hot_label_col: float(label)
            }
            row = sptyp.Row(**_row_struct)
            local_rows.append(row)

        image_uri_df = self.session.createDataFrame(local_rows)
        return image_uri_df
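For reference, the dense and sparse branches above encode the same one-hot vector; a standalone check (assuming spla refers to pyspark.ml.linalg, matching the example's alias):
import numpy as np
import pyspark.ml.linalg as spla  # assumption: the example's `spla` alias

cardinality, label = 5, 2

label_inds = np.zeros(cardinality)
label_inds[label] = 1.0
dense_vec = spla.Vectors.dense(label_inds.tolist())
sparse_vec = spla.Vectors.sparse(cardinality, {label: 1})

# both represent the same one-hot encoding of `label`
assert (dense_vec.toArray() == sparse_vec.toArray()).all()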
def appendWeatherData(ts, stID, tsVec, airTemp, cloudCov, precip1Hr):
    # Returns weather information to be included in taxi trip
    hrInSec = 3600  # 1 hour = 3600 seconds
    if ts is not None:
        deltaTime = ts - tsVec[stID][0]
        ind = int(round(truediv(deltaTime, hrInSec)))
        if ind < 0:
            ind = 0
        elif ind >= len(tsVec[stID]):
            ind = len(tsVec[stID]) - 1
        return sqlt.Row('airTemp', 'cloudCov', 'precip1Hr')(
            airTemp[stID][ind], cloudCov[stID][ind], precip1Hr[stID][ind])

    else:
        return sqlt.Row('airTemp', 'cloudCov', 'precip1Hr')(None, None, None)
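A small usage sketch with hypothetical per-station weather series (assumes appendWeatherData above, the operator.truediv import, and the sqlt alias are in scope): the time offset is rounded to whole hours and clamped into the station's series.
tsVec     = {0: [0, 3600, 7200]}       # station 0: hourly timestamps in seconds
airTemp   = {0: [10.0, 11.5, 12.0]}
cloudCov  = {0: [0.2, 0.4, 0.6]}
precip1Hr = {0: [0.0, 0.0, 1.2]}

row = appendWeatherData(5000, 0, tsVec, airTemp, cloudCov, precip1Hr)
# 5000 s is ~1.4 h after the first reading -> index 1:
# Row(airTemp=11.5, cloudCov=0.4, precip1Hr=0.0)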
Example #9
def fill_not_null_values(elements):
    ordered_dict = OrderedDict()
    for element in elements:
        ordered_dict[element.timestamp] = element
    bid = None
    ask = None
    price = None
    quantity = None
    for (key, element) in ordered_dict.items():
        if element.bid is not None:
            bid = element.bid
        if element.ask is not None:
            ask = element.ask
        if element.price is not None:
            price = element.price
        if element.quantity is not None:
            quantity = element.quantity
        row = T.Row(id=element.id,
                    timestamp=element.timestamp,
                    bid=bid,
                    ask=ask,
                    price=price,
                    quantity=quantity)
        ordered_dict[key] = row
    return ordered_dict.values()
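A small usage sketch of the forward-fill above (hypothetical tick data; assumes fill_not_null_values and the collections.OrderedDict import from the original module are in scope):
from pyspark.sql import types as T

elements = [
    T.Row(id=1, timestamp=1, bid=10.0, ask=None, price=None, quantity=5.0),
    T.Row(id=2, timestamp=2, bid=None, ask=10.5, price=10.2, quantity=None),
]

filled = list(fill_not_null_values(elements))
# filled[1] keeps its own ask/price but inherits bid=10.0 and quantity=5.0
# from the earlier element.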
Example #10
def take_log_in_all_columns(row: types.Row):
    old_row = row.asDict()
    new_row = {
        f'log({column_name})': math.log(value)
        for column_name, value in old_row.items()
    }
    return types.Row(**new_row)
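This transform is usually mapped over a DataFrame's RDD; a minimal sketch (assumes an active SparkSession named spark and strictly positive numeric columns):
df = spark.createDataFrame([(1.0, 10.0), (2.0, 20.0)], ['x', 'y'])

# apply the row-wise transform and rebuild a DataFrame;
# the resulting columns are named log(x) and log(y)
logged_df = df.rdd.map(take_log_in_all_columns).toDF()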
def extract_article(row):
    """ Extract the content of the article.
    normalize the titles"""
    #     redirect = row.page_redirect_title if row.page_redirect_title is not None else ""
    return T.Row(pid=row.page_id,
                 title=normalise_title(row.page_title),
                 title_rd=normalise_title(row.page_redirect_title),
                 wikitext=row.revision_text)
Example #12
def nulls(row: T.Row) -> T.Row:
    d = row.asDict()
    _cnt = 0
    for _var in d.keys():
        if d[_var] is None:
            _cnt += 1
    d['nullcnt'] = _cnt
    return T.Row(**d)
Example #13
    def test_row(self):
        first = T.Row(one=1, two=2, three=3, four=4)
        second = T.Row(three=3, two=2, four=4, one=1)

        # Spark currently sorts the fields of each row internally
        # so these will match...
        SparklyTest().assertRowsEqual(first, second)
        self.assertEqual(first, second)

        # but since Rows extend tuples, only the values are checked as
        # long as the fields define the same alpha order
        first = T.Row(one=1, two=2, three=3, four=4)
        second = T.Row(th=3, tw=2, f=4, o=1)

        # We fix this in our version by default
        with self.assertRaises(AssertionError):
            SparklyTest().assertRowsEqual(first, second)
        self.assertEqual(first, second)
def get_valid_ngrams(row):
    text = row.chunk
    found_anchors = []
    for n in range(10, 0, -1):
        ngrams = get_ngrams(text, n)
        for ng in ngrams:
            if ng in anchors_keys:
                found_anchors.append(ng)
#                 text.replace(ng, " @ ")
    return [T.Row(pid=row.pid, anchor=a) for a in found_anchors]
Example #15
            def _(orig_row):
                orig_rows = orig_row.rows
                new_rows = [list(row) for row in orig_rows]
                for column, (datatype, fn) in columns.items():
                    fn_rows = fn(orig_rows)
                    for i, orig_row in enumerate(orig_rows):
                        new_rows[i].append(fn_rows[orig_row])

                NewRow = pyspark_types.Row(*schema_names)
                return [NewRow(*row) for row in new_rows]
    def test_i_can_fly(self):
        input = [pst.Row(a=1, b=2)]
        input_df = self.spark.createDataFrame(input)

        expect = [{'a': 1}]

        actual_df = input_df.select("a")
        actual = as_list(actual_df)

        self.assertEqual(actual, expect)
Example #17
def create_spark_dataframe_from_list(label_list):
  # Create image categorical labels (integer IDs)
  local_rows = []
  for label in label_list:
    _row_struct = {"label": label }
    row = sptyp.Row(**_row_struct)
    local_rows.append(row)

  dataframe = sqlContext.createDataFrame(local_rows)
  return dataframe
Example #18
def encode_authors(actors_str):
    actors = [a.strip().lower() for a in actors_str.split(",")]

    ids = []
    for a in actors:
        ids.append(actors_id_dict[a])

    ids = sorted(ids) + (4 - len(ids)) * [None]

    return t.Row("actor_id_0", "actor_id_1", "actor_id_2", "actor_id_3")(*ids)
Example #19
def generate_vector_df(spark, glove, vocab_df):
    vector_schema = T.StructType([
        T.StructField('id', T.IntegerType(), True),
        T.StructField('vector', T.ArrayType(T.DoubleType()), True)
    ])

    vector_df = spark.sparkContext \
        .parallelize([(i, [float(d) for d in glove.word_vectors[i]]) for i in range(len(glove.word_vectors))]) \
        .map(lambda t: T.Row(id=t[0], vector=t[1])).toDF(vector_schema)

    return vector_df
Example #20
def get_sentiment(tweet="Default"):
    """
    udf that repackages selected fields of the tweet dict into a Row.
    """
    tweet_json = tweet
    return t.Row('id', 'full_text', 'len', 'in_reply_to_status_id', 'date',
                 'source', 'likes', 'retweet', 'sent_by', 'friend_of', 'hash_tag') \
        (tweet_json['id'], tweet_json['full_text'], tweet_json['len'], tweet_json['in_reply_to_status_id'],
         tweet_json['date'], tweet_json['source'], tweet_json['likes'], tweet_json['retweet'],
         tweet_json['sent_by'], tweet_json['friend_of'], tweet_json['hash_tag'])
def get_links(page):
    links = []
    for m in links_regex.findall(page.wikitext):
        link = normalise_title(m[0])
        anchor = m[1] if len(m) > 1 and len(m[1]) > 0 else link
        if len(link) > 0:
            links.append(
                T.Row(pid=page.pid,
                      title=page.title,
                      link=link,
                      anchor=normalise_anchor(anchor)))
    return links
Example #22
def mse(row: T.Row) -> T.Row:
    d = row.asDict()
    _mse = 0.0
    if d['Sales_Pred'] is None:
        print("'Sales_Pred'=None")
        _mse = 0
    elif d['sales'] is None:
        _mse = d['Sales_Pred']**2
    else:
        _mse = (d['Sales_Pred'] - d['sales'])**2
    d['mse'] = _mse
    return T.Row(**d)
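A short usage sketch: map the per-row mse helper over a DataFrame and average the new column (hypothetical data; assumes an active SparkSession named spark):
from pyspark.sql import functions as F

df = spark.createDataFrame([(100.0, 110.0), (200.0, 190.0)],
                           ['sales', 'Sales_Pred'])

scored = df.rdd.map(mse).toDF()
scored.agg(F.avg('mse').alias('mean_squared_error')).show()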
Example #23
def tag_ori(rg, oi, user_orient):
    """
    Parse user-profile tags.
    """
    tag_set = set()
    try:
        province = "0"
        city = "0"
        if rg is not None and "_" in rg:
            province, city = rg.split("_", 1)

        if oi is not None:
            tag_set.add("19_%s" % oi)

        if user_orient not in [None, '']:
            for tag in user_orient.split(","):
                if "_" in tag:
                    prefix = tag.split("_", 1)[0]
                    if prefix in ["12", "1", "13", "14", "15", "16", "17", "8"]:
                        tag_set.add(tag)
        return T.Row('province', 'city', 'tags')(
            province, city,
            ",".join(sorted(tag_set, key=lambda x: [int(i) for i in x.split("_")])))
    except Exception:
        return T.Row('province', 'city', 'tags')("", "", "")
    def CreateTrainImageUriandLabels(image_uris, label, label_name, cardinality, isDefault):
        # Create image categorical labels (integer IDs)
        local_rows = []
        for uri in image_uris:
            label_inds = np.zeros(cardinality)
            label_inds[label] = 1.0
            one_hot_vec = spla.Vectors.dense(label_inds.tolist())
            _row_struct = {"uri": uri, "one_hot_label": one_hot_vec, "label": int(label),
                           "label_name": str(label_name), "isDefault": int(isDefault)}
            row = sptyp.Row(**_row_struct)
            local_rows.append(row)

        image_uri_df = sqlContext.createDataFrame(local_rows)
        return image_uri_df
def omdb_data(arguments):
    movie_name, year = arguments
    client = OMDBClient(apikey=OMDB_API_KEY)
    try:
        result = client.get(title=movie_name, year=year, fullplot=True, tomatoes=True)
    except HTTPError as e:
        print(e)

        client.set_default("apikey", OMDB_API_KEY_fallback)

        result = client.get(title=movie_name, year=year, fullplot=True, tomatoes=True)

    result_to_keep = {}

    for key in requested_flat_fields:
        result_to_keep[key] = result.get(key, None)

    for nested_field in requested_nested_fields:
        requested_nested_list = requested_nested_fields[nested_field]
        nested_list = result.get(nested_field, None)

        if nested_list:
            for nested_dict in nested_list:
                source = nested_dict.get("source", None)

                if source:
                    value = nested_dict.get("value", None)

                    if source in requested_nested_list:

                        source_formatted = to_snake_case(source)
                        key = f"{nested_field}_{source_formatted}"

                        result_to_keep[key] = value

            requested_sources = requested_nested_fields[nested_field]
            for requested_source in requested_sources:
                source_formatted = to_snake_case(requested_source)
                key = f"{nested_field}_{source_formatted}"
                if key not in result_to_keep:
                    result_to_keep[key] = None

        else:
            requested_sources = requested_nested_fields[nested_field]
            for requested_source in requested_sources:
                source_formatted = to_snake_case(requested_source)
                key = f"{nested_field}_{source_formatted}"
                result_to_keep[key] = None

    return t.Row(*list(result_to_keep.keys()))(*list(result_to_keep.values()))
def get_plain_text_without_links(row):
    """ Replace the links with a dot to interrupt the sentence and get the plain text """
    wikicode = row.wikitext
    wikicode_without_links = re.sub(links_regex, '.', wikicode)
    wikicode_without_links = re.sub(references_regex, '.',
                                    wikicode_without_links)
    ## mwparserfromhell is not available on the spark-cluster yet,
    ## so the parsed text is overridden with the regex-cleaned wikitext below
    try:
        text = mwparserfromhell.parse(wikicode_without_links).strip_code()
    except Exception:
        text = wikicode_without_links
    text = wikicode_without_links
    return T.Row(pid=row.pid,
                 title=normalise_title(row.title),
                 text=text.lower())
def stats_from_id(video_id):
    if not video_id:
        return None, None, None

    youtube = youtube_utils.get_authenticated_service(
        api_service_name, api_version, scopes, n_tries=0
    )

    n_tries = 0
    success = False
    while not success and n_tries < 19:
        try:
            request = youtube.videos().list(part="statistics", id=video_id)
            response = request.execute()
            success = True

        except HttpError as e:
            n_tries += 1
            youtube = youtube_utils.get_authenticated_service(
                api_service_name, api_version, scopes, n_tries=n_tries,
            )
    if not success:
        return None, None, None

    try:
        stats = response["items"][0]["statistics"]

        view_count = int(stats["viewCount"])
        like_count = int(stats["likeCount"])
        dislike_count = int(stats["dislikeCount"])

        engagement_score = (like_count + dislike_count) / view_count
        positive_engagement_score = like_count / dislike_count

    except (KeyError, ZeroDivisionError):
        return None, None, None

    return t.Row(
        "youtube_view_count",
        "youtube_engagement_score",
        "youtube_positive_engagement_score",
    )(view_count, engagement_score, positive_engagement_score)
Example #28
def compute_tweet_sentiment(msg):
    parameters = {'tweet': msg}
    r = requests.get(url=SENTIMENT_SERVER_URL, params=parameters)
    sentiment = 1
    psentiment = 0
    ngsentiment = 0
    nsentiment = 0

    nltk_sentiment = 1
    nltk_psentiment = 0
    nltk_ngsentiment = 0
    nltk_nsentiment = 0

    if r.status_code == 200:
        data = r.json()
        sentiment = data['Sentiment']
        nltk_sentiment = data['Sentiment_nltk']

        if sentiment == 0:
            ngsentiment = 1
        elif sentiment == 1:
            nsentiment = 1
        elif sentiment == 2:
            psentiment = 1

        if nltk_sentiment == 0:
            nltk_ngsentiment = 1
        elif nltk_sentiment == 1:
            nltk_nsentiment = 1
        elif nltk_sentiment == 2:
            nltk_psentiment = 1

        print(data)

    return t.Row('sentiment', 'psentiment', 'ngsentiment', 'nsentiment',
                 'nltk_sentiment', 'nltk_psentiment', 'nltk_ngsentiment',
                 'nltk_nsentiment')(sentiment, psentiment, ngsentiment,
                                    nsentiment, nltk_sentiment,
                                    nltk_psentiment, nltk_ngsentiment,
                                    nltk_nsentiment)
    ## main namespace
    .where(F.col('page_namespace') == 0)
    ## no redirect-pages
    #     .where(F.col('page_redirect_title')=='')
    .where(F.col('revision_text').isNotNull())
    .where(F.length(F.col('revision_text')) > 0))

## extracting pid, title, title_rd, and the wikitext
## titles are normalized
wikipedia = spark.createDataFrame(
    wikipedia_all.rdd.map(extract_article).filter(lambda r: r is not None))

## only redirects
redirects = spark.createDataFrame(
    wikipedia.where(F.col('title_rd') != '').rdd.map(
        lambda r: T.Row(title_from=r.title, title_to=r.title_rd))).distinct()

## only articles (no redirect title)
articles = (wikipedia.where(F.col('title_rd') == '').select(
    'pid', 'title', 'wikitext'))

## extract the links
links = spark.createDataFrame(articles.rdd.flatMap(get_links))

links_resolved = (
    links.join(
        redirects, links['link'] == redirects['title_from'],
        how='leftouter').select(
            'pid',
            'title',
            'anchor',
def get_chunks(row):
    return [
        T.Row(pid=row.pid, chunk=blocks.strip())
        for blocks in re.split(r'[\n.,;:()!"]', row.text)
        if len(blocks.strip()) > 0
    ]
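get_chunks is intended to be flatMapped over rows that carry pid and plain text; a minimal sketch (hypothetical input; assumes an active SparkSession named spark and the re import from the original module):
from pyspark.sql import types as T

plain_text_df = spark.createDataFrame(
    [T.Row(pid=1, text="first sentence. second sentence, with a clause")])

# one output row per non-empty chunk, e.g. (pid=1, chunk='first sentence')
chunks_df = spark.createDataFrame(plain_text_df.rdd.flatMap(get_chunks))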