Example #1
def calc_jaccard_sim(df_to_process, df_match, thresh=.3, padded=True):
    # Pick the padded or unpadded n-gram columns, then keep row pairs whose
    # Jaccard similarity (|intersection| / |union|) clears the threshold.
    left = df_to_process.ngrams_pad if padded else df_to_process.ngrams
    right = df_match.ngrams_pad if padded else df_match.ngrams
    jaccard = (F.size(F.array_intersect(left, right)) /
               F.size(F.array_union(left, right)))
    return df_to_process.join(df_match, jaccard > thresh)
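
A minimal usage sketch, assuming two toy frames that each carry an `ngrams_pad` array column (all names here are illustrative):

import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
left = spark.createDataFrame([(1, ["ab", "bc", "cd"])], ["left_id", "ngrams_pad"])
right = spark.createDataFrame([(2, ["bc", "cd", "de"])], ["right_id", "ngrams_pad"])
# Jaccard = |{bc, cd}| / |{ab, bc, cd, de}| = 0.5 > 0.3, so the pair survives
calc_jaccard_sim(left, right).show()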
Example #2
def get_df_mincityear_onw_cit(df_ani):
    return (
        df_ani
        .filter(sort_pub_year + ' >= ' + mincityear)
        .withColumn('references_u', func.array_distinct('references'))
        .select(
            func.col('Eid').alias('CitingEid'),
            func.explode('references_u').alias('Eid'),
            func.when(func.col('source.srcid').isin(discontinued_sources),
                      func.lit(1))
                .otherwise(func.lit(0))
                .alias('isDiscontinuedCiting'),
            func.col('Au.auid').cast('array<long>').alias('CitingAuids'))
        .join(
            df_ani.select(
                'Eid',
                func.col('Au.auid').cast('array<long>').alias('CitedAuids')),
            ['Eid'])
        .withColumn('overLappingAuthors',
                    func.size(func.array_intersect('CitingAuids', 'CitedAuids')))
        .select(
            'CitingEid',
            'Eid',
            'isDiscontinuedCiting',
            func.expr('IF(overLappingAuthors>0,1,0)').alias('isSelfCitation'),
            func.expr('IF(overLappingAuthors>0,NULL,CitingEid)')
                .alias('CitingEidNonSelf'))
        .groupBy('Eid')
        .agg(
            func.count('*').alias('CitationCount'),
            func.sum('isSelfCitation').alias('SelfCitationCount'),
            (func.count('*') - func.sum('isSelfCitation'))
                .alias('CitationCountNonSelf'),
            func.collect_list('CitingEid').alias('CitingEids'),
            func.collect_list('CitingEidNonSelf').alias('CitingEidsNonSelf'),
            func.sum('isDiscontinuedCiting')
                .alias('CitationCountFromDiscontinuedSources')))
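
The function leans on names bound elsewhere in the pipeline; a minimal sketch of plausible bindings (the values are hypothetical):

import pyspark.sql.functions as func

sort_pub_year = 'sort_pub_year'          # column holding the publication year
mincityear = '2011'                      # earliest citing year; a string, since it is spliced into the filter expression
discontinued_sources = [123456, 654321]  # srcids flagged as discontinued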
Example #3
def count_neighborhood(df_to_process):
    df_cross_join = df_to_process.crossJoin(df_pre_neighborhood)
    df_processed = df_cross_join.withColumn(
        "size", F.size(F.array_intersect("token_filtered", "to_match")))
    df_street = df_processed.filter(df_processed.size != 0)
    df_left = df_processed.filter(
        df_processed.size == 0).drop("to_match").drop("size")
    return "neighborhood", df_left, df_street.select(
        F.sum("_c1"),
        F.lit('neighborhood').alias("sem_type"))
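
`df_pre_neighborhood` comes from module scope, and `df_to_process` is expected to carry a `token_filtered` array column and a `_c1` count column. A sketch of the assumed reference frame (data hypothetical, reusing a live `spark` session as in the earlier sketch):

df_pre_neighborhood = spark.createDataFrame(
    [(["harlem", "astoria"],)], ["to_match"])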
Example #4
def get_around_class_except():
    try:
        print(f"{str(dt.now())}  retail stores ordering abnormal cigarette categories")
        co_cust = get_valid_co_cust(spark).select("cust_id")

        co_co_line = get_co_co_line(spark,scope=[1,1],filter="month")\
                                  .select("cust_id","item_id","qty_ord")

        plm_item = get_plm_item(spark).select("item_id", "item_name")

        # 1. Quantity of each cigarette type ordered by each retailer
        cust_item_sum = co_co_line.join(plm_item, "item_id") \
            .groupBy("cust_id", "item_name") \
            .agg(f.sum("qty_ord").alias("cust_item_sum"))

        # 2. Each retailer's top-3 cigarette types by order quantity
        win = Window.partitionBy("cust_id").orderBy(
            col("cust_item_sum").desc())
        rank3 = cust_item_sum.withColumn("rank", f.row_number().over(win)) \
            .where(col("rank") <= 3) \
            .groupBy("cust_id") \
            .agg(f.collect_list("item_name").alias("items"))

        win = Window.partitionBy("cust_id1").orderBy(
            col("one_km_item_sum").desc())
        # Retailers cust_id0 within the neighborhood of each retailer cust_id1
        around_cust = get_around_cust(spark, 1).select("cust_id1", "cust_id0")
        """
        The 1 km neighborhood of retailer cust_id1 contains the retailers cust_id0.
        1. First join: find which retailers cust_id0 lie within 1 km of each retailer cust_id1.
        2. Second join: attach the per-type order quantities of those cust_id0 retailers (cust_id0 matches cust_item_sum's cust_id).
        3. Group by cust_id1 and item_name to total each cigarette type's quantity within 1 km.
        """
        # 3. Top-3 cigarette types ordered within 1 km of each retailer
        one_km_rank3 = around_cust.join(cust_item_sum, col("cust_id0") == col("cust_id")) \
            .select("cust_id1", "item_name", "cust_item_sum") \
            .groupBy("cust_id1", "item_name") \
            .agg(f.sum("cust_item_sum").alias("one_km_item_sum")) \
            .withColumn("rank", f.row_number().over(win)) \
            .where(col("rank") <= 3) \
            .groupBy("cust_id1") \
            .agg(f.collect_list("item_name").alias("one_km_items"))

        colName = ["regulation_abno", "ciga_top3_last_month", "ciga_top3_km"]
        # 4. Intersect the two top-3 lists; an empty intersection marks an anomaly
        rank3.join(one_km_rank3, col("cust_id") == col("cust_id1")) \
            .where(f.size(f.array_intersect(col("items"), col("one_km_items"))) == 0) \
            .withColumn(colName[0],f.lit(1))\
            .withColumnRenamed("items",colName[1])\
            .withColumnRenamed("one_km_items",colName[2])\
            .join(co_cust,"cust_id")\
            .foreachPartition(lambda x:write_hbase1(x,colName,hbase))
    except Exception as e:
        tb.print_exc()
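
The snippet assumes imports along these lines (the `get_*` helpers, `hbase`, and `write_hbase1` are project-specific and not shown):

from datetime import datetime as dt
import traceback as tb
import pyspark.sql.functions as f
from pyspark.sql.functions import col
from pyspark.sql.window import Window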
Example #5
def jaccard_index(primary_col: str, secondary_col: str, output_col: str,
                  df: DataFrame):
    """Compute the Jaccard index of two array columns (intersection size over
    union size), returning null when either input column is null."""
    return df.withColumn(
        output_col,
        F.when(
            F.col(primary_col).isNull() | F.col(secondary_col).isNull(),
            None,
        ).otherwise(
            F.size(F.array_intersect(F.col(primary_col), F.col(secondary_col)))
            / F.size(F.array_union(F.col(primary_col), F.col(secondary_col)))),
    )
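
A quick run on a toy frame shows the null handling (a null input yields a null score instead of an error):

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(["a", "b"], ["b", "c"]), (None, ["b"])],
    "left: array<string>, right: array<string>")
jaccard_index("left", "right", "jaccard", df).show()
# row 1: |{b}| / |{a, b, c}| = 0.333...; row 2: null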
Example #6
def scd_analyze(df, merge_on=None, state_col='_state', updated_col='_updated'):
    add_ids = '##add_ids'
    del_ids = '##del_ids'
    upd_ids = '##upd_ids'

    c = set(df.columns).difference({state_col, updated_col})
    colnames = [x for x in df.columns if x in c]

    on = merge_on or colnames
    on = on if isinstance(on, (list, tuple)) else [on]
    on = [c for c in on if c in colnames]

    s = on + [state_col, updated_col]
    cols = [x for x in df.columns if x not in s]

    a = df.filter(f'{state_col} = 0') \
        .groupby(updated_col) \
        .agg(F.collect_set(F.concat(*on)).alias(add_ids)) \
        .select(updated_col, add_ids)

    d = df.filter(f'{state_col} = 1') \
        .groupby(updated_col) \
        .agg(F.collect_set(F.concat(*on)).alias(del_ids)) \
        .select(updated_col, del_ids)

    res = a.join(d, on=updated_col, how='outer')
    res = res.select(updated_col,
                     F.coalesce(add_ids, F.array([])).alias(add_ids),
                     F.coalesce(del_ids, F.array([])).alias(del_ids))

    if cols:
        agg_funcs = [(F.countDistinct(x) - F.lit(1)).alias(x) for x in cols]
        cnt = df.groupby(*on, updated_col).agg(*agg_funcs)

        agg_names = [F.lit(x) for x in cols]
        agg_sums = [F.sum(x) for x in cols]
        cnt = cnt.groupby(updated_col).agg(
            F.map_from_arrays(F.array(*agg_names),
                              F.array(*agg_sums)).alias('changes'))

        res = res.join(cnt, on=updated_col)
    else:
        res = res.withColumn('changes', F.lit(None))

    res = res.select('*', F.array_intersect(add_ids, del_ids).alias(upd_ids))
    res = res.select(
        F.col(updated_col).alias('updated'),
        F.size(upd_ids).alias('upd'),
        F.size(F.array_except(add_ids, upd_ids)).alias('add'),
        F.size(F.array_except(del_ids, upd_ids)).alias('del'), 'changes')

    return res.orderBy('updated')
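
A minimal sketch of the expected input shape, assuming `_state` marks rows as added (0) or deleted (1) and `_updated` carries the batch stamp:

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("k1", "a", 0, "2021-01-01"),
     ("k1", "a", 1, "2021-01-02"),   # k1 deleted...
     ("k1", "b", 0, "2021-01-02")],  # ...and re-added with a new value: an update
    ["id", "value", "_state", "_updated"])
scd_analyze(df, merge_on="id").show()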
Example #7
def count_address_street_name(df_to_process):
    udf_address_regex = F.udf(address_regex)
    df_cross_join = df_to_process.crossJoin(df_pre_street)
    df_processed = df_cross_join.withColumn(
        "size", F.size(F.array_intersect("token_filtered", "to_match")))
    df_street = df_processed.filter(df_processed.size != 0).withColumn(
        "sem_type", udf_address_regex(df_processed._c0))
    df_left = df_processed.filter(
        df_processed.size == 0).drop("to_match").drop("size")
    return "address_street_name", df_left, df_street.groupBy('sem_type') \
        .agg({'_c1': 'sum'}).select('sum(_c1)', 'sem_type')
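
This excerpt also reads module-level state: `address_regex`, a plain Python function that maps the raw `_c0` cell to a semantic-type label, and `df_pre_street`, a reference frame with a `to_match` array column. A hypothetical sketch of both:

def address_regex(raw_value):
    # toy stand-in; the real logic would inspect the raw address text
    return "street_name"

df_pre_street = spark.createDataFrame([(["st", "ave", "blvd"],)], ["to_match"])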
Example #8
    def transform(self, date):
        self.df = self.df \
                      .groupBy('author') \
                      .agg(F.collect_set("link_id" ).alias('link_ids')) \
                      .repartition('author')

        self.df = self.df.alias('df1') \
                .join(self.df.alias('df2')) \
                .where('df1.author < df2.author') \
                .select(F.col('df1.author').alias('author_1'), \
                    F.col('df2.author').alias('author_2'), \
                    F.size(F.array_intersect('df1.link_ids',
                                             'df2.link_ids')) \
                        .alias('weight')) \
                .where('weight > %d' % self.truncation)
        return self
Example #9
def similaryBasedOnFollowers(data, minFollowers=20, debug=False):

    # We start by renaming the user column in line with the notation
    # above.
    data = data.withColumnRenamed('follows', 'u1')

    # ==== Step 1 ====
    u1_fu1 = data.groupBy('u1').agg(F.collect_set(
        data.user).alias('fu1')).filter(F.size('fu1') >= minFollowers)

    if (debug):
        print('>> Step 1 :: u1 f(u1) <<')
        u1_fu1.show()

    # ==== Step 2 ====
    # First create a "dual" of data by renaming columns.
    # This will help the subsequent join.
    u2_fu2 = u1_fu1.withColumnRenamed('u1',
                                      'u2').withColumnRenamed('fu1', 'fu2')

    prod = u1_fu1.crossJoin(u2_fu2).filter(u1_fu1.u1 < u2_fu2.u2)

    if (debug):
        print('>> Step 2 :: u1 f(u1) u2 f(u2) <<')
        prod.show()

    # ==== Step 3 ====
    prod2 = prod.withColumn('I',
                            F.array_intersect(prod.fu1, prod.fu2)).withColumn(
                                'U',
                                F.array_union(prod.fu1,
                                              prod.fu2)).drop('fu1', 'fu2')

    if (debug):
        print('>> Step 3 :: u1 u2 I(u1,u2) U(u1,u2) <<')
        #prod2.orderBy('I',ascending=False).show()
        prod2.show()

    # ==== Step 4 ====
    result = prod2.withColumn('JI', F.size('I') / F.size('U')).drop('I', 'U')

    if (debug):
        print('>> Step 4 :: u1 u2 J(u1,u2) <<')
        result.show()
    return result
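
A toy run, assuming an edge list where each row means `user` follows `follows` (the follower threshold is lowered so the toy data survives step 1):

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()
edges = spark.createDataFrame(
    [("a", "x"), ("b", "x"), ("a", "y"), ("b", "y"), ("c", "y")],
    ["user", "follows"])
similaryBasedOnFollowers(edges, minFollowers=2).show()
# x and y share followers {a, b} out of {a, b, c}, so JI = 2/3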
Example #10
	def verification(self, candDF, threshold, key1, key2, keep_cols1, keep_cols2):
		"""
		Input: $candDF is the output DataFrame from the 'filtering' function.
			   $threshold is a float value in (0, 1].

		Output: Return a new DataFrame $resultDF that represents the ER result.
				It contains id1, id2, a jaccard column, and any kept columns.

		Comments: There are two differences between $candDF and $resultDF:
				  (1) $resultDF adds a new column, jaccard, storing the jaccard
					  similarity between $key1 and $key2
				  (2) $resultDF drops the rows whose jaccard similarity is
					  below $threshold
		"""
		return candDF.select(
			'id1', 'id2',
			(size(array_intersect(key1, key2))
			 / size(array_union(key1, key2))).alias('jaccard'),
			# keep caller-specified columns from both sides
			*keep_cols1, *keep_cols2
		).where(col('jaccard') >= threshold)
Example #11
def array_intersection(a: Column, b: Column) -> Column:
    """Intersect two array columns and drop empty-string elements"""
    return F.array_remove(F.array_intersect(a, b), "")
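
Because `array_intersect` treats the empty string like any other element, the extra `array_remove` matters when the arrays came from splitting messy text; a quick illustration:

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(["a", ""], ["", "a", "b"])], ["x", "y"])
df.select(array_intersection(F.col("x"), F.col("y")).alias("common")).show()
# gives [a] rather than [, a]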
Example #12
         f.size(f.array_distinct("stem.result")).alias(
             "title_n_distinct_words"),
         f.size(f.expr("filter(pos.result, x -> x like 'V%')")
                ).alias("title_n_verbs"),
         f.size(f.expr("filter(pos.result, x -> x like 'N%')")
                ).alias("title_n_nouns"),
         f.size(f.expr("filter(pos.result, x -> x like 'PR%')")
                ).alias("title_n_pronouns"),
         f.size(f.expr("filter(pos.result, x -> x like 'J%')")
                ).alias("title_n_adjectives"),
         f.size(f.expr("filter(pos.result, x -> x like 'RB%')")
                ).alias("title_n_adverbs"),
         f.array_distinct(f.col("stem.result")).alias("title_words")
         )\
 .withColumn("title_in_body_perc",
             f.size(f.array_intersect(f.col("title_words"), f.col("body_words")))/f.col("title_n_distinct_words"))\
 .selectExpr("dataset_name",
             "post_id",
             "body_clean_nocode",
             "title",
             "body_n_sentences",
             "body_n_words",
             "body_n_distinct_words",
             "body_n_verbs",
             "body_n_nouns",
             "body_n_pronouns",
             "body_n_adjectives",
             "body_n_adverbs",
             "title_n_words",
             "title_n_distinct_words",
             "title_n_verbs",
Example #13
readSongsDF.unpersist()

print("Insert playlists")
createPlaylistsDF = df_edit.withColumn(
    'Exp_Results', F.explode('create.playlists')).select('Exp_Results.*')
createPlaylistsDF.show(truncate=False)

# Only consider playlists not in the source playlists
print("Insert playlists Result")
createPlaylistsDF = createPlaylistsDF.join(
    readPlaylistsDF, createPlaylistsDF.id == readPlaylistsDF.id,
    'leftanti').join(readUserDF, createPlaylistsDF.user_id == readUserDF.id,
                     'inner').select(
                         createPlaylistsDF.id,
                         F.array_intersect(createPlaylistsDF.song_ids,
                                           F.array([F.lit(x) for x in songs
                                                    ])).alias("song_ids"),
                         createPlaylistsDF.user_id)
readPlaylistsDF = readPlaylistsDF.union(createPlaylistsDF)
createPlaylistsDF.unpersist()
readUserDF.unpersist()

readPlaylistsDF.orderBy('id').show()

print("Delete playlists")
deletePlaylistsDF = df_edit.withColumn(
    'id', F.explode('delete.playlist_ids')).select("id")
deletePlaylistsDF.show(truncate=False)

# Only delete playlists that exist in the source playlists
print("Delete playlists Result")
Example #14
def tag_info_df(spark):
    """ Extract features from the tags of a post

    Args:
        spark (SparkSession): used to run queries and commands

    Returns:
        DataFrame: With columns [
            (post)_Id,
            #tags,
            contains_language_tag,
            contains_platform_tag
        ]
    """
    language_list = ["abap", "abc", "actionscript", "ada", "algol", "algol 58", "algol 60", "algol w", "algol 68",
                     "alice", "amiga e", "apex", "apl", "applescript", "argh!", "aargh!", "assembly",
                     "assembly language", "autolisp", "autolt", "awk",
                     "b", "bash", "basic", "ballerina", "bbc basic", "bc", "bcpl", "blitz basic", "bourne shell",
                     "brainfuck",
                     "c", "c++", "c#", "cfml", "cl", "classic visual basic", "clean", "clipper", "clojure", "cobol",
                     "comal", "common lisp", "coffeescript", "crystal", "c shell", "ct",
                     "d", "darkbasic", "dart", "decimal basic", "delphi", "delta cobol", "div games studio",
                     "egl", "eiffel", "elixir", "elm", "emacs lisp", "erlang", "euphoria",
                     "f#", "factor", "fenix project", "forth", "fortran", "foxpro",
                     "gambas", "gcl", "gml", "go", "grasshopper", "groovy",
                     "hack", "haskell", "hypertalk",
                     "icon", "inform", "io", "ironpython",
                     "j", "just another language", "java", "javascript", "just basic", "jscript", "julia",
                     "korn shell", "kotlin",
                     "labview", "ladder logic", "leet", "liberty basic", "lisp", "logo", "lua",
                     "m4", "machine", "machine language", "malbolge", "maple", "matlab", "m-code", "mercury", "ml",
                     "modula-2", "mondrian", "mql4", "msl",
                     "natural",
                     "oberon", "objective-c", "objectpal", "object pascal", "ocaml", "opencl", "openedge abl", "oz",
                     "pascal", "pawn", "perl", "php", "piet", "pl/1", "pl/i", "pl/sql", "pl/pgsql", "postscript",
                     "powerbasic", "powerbuilder", "powershell", "processing", "progress", "prolog", "providex",
                     "purebasic", "python",
                     "q#", "qbasic",
                     "r", "raku", "rexx", "ring", "rpg", "ruby", "rust",
                     "sas", "scala", "sed", "scheme", "scratch", "scratch jr.", "seed7", "self", "simula", "smalltalk",
                     "smallbasic", "snobol", "solidity", "spark", "spss", "sql", "stata", "swift",
                     "tcl", "tex", "ti-basic", "transact-sql", "t-sql", "turbobasic", "turbo c", "turbo pascal",
                     "typescript",
                     "ubasic",
                     "vala", "vala/genie", "vb", "vbs", "vbscript", "verilog", "vhdl", "visual basic", "visual c",
                     "visual foxpro", "visual objects", "vbscripts", "whitespace",
                     "xslt", "xquery",
                     "yaml"]
    language_list_col = array(*[lit(x) for x in language_list])
    platform_list = ["arthur", "arx", "mos", "risc-ix", "risc-os", "amigaos", "amigaos-1.0-3.9", "amigaos-4",
                     "amiga-unix", "amsdos", "contiki", "cp-m-2.2", "cp-m-plus", "symbos", "apple-ii", "apple-dos",
                     "apple-pascal", "prodos", "gs-os", "gno-me", "apple-iii", "apple-sos", "apple-lisa",
                     "apple-macintosh", "classic-mac-os", "a-ux", "copland", "mklinux", "pink", "rhapsody", "macos",
                     "macos-server", "apple-network-server", "ibm-aix", "apple-messagepad", "newton-os", "iphone",
                     "ios", "ipad", "ipados", "apple-watch", "watchos", "apple-tv", "tvos", "a-rose", "ipod-software",
                     "netbsd", "domain-os", "atari-dos", "atari-tos", "atari-multitos", "xts-400", "beos", "beia",
                     "beos-r5.1d0", "magnussoft-zeta", "unix", "unix-time-sharing-system-v6", "pwb-unix", "cb-unix",
                     "unix-time-sharing-system-v7", "unix-system-iii", "unix-system-v", "unix-time-sharing-system-v8",
                     "unix-time-sharing-system-v9", "unix-time-sharing-system-v10", "besys", "plan-9-from-bell-labs",
                     "inferno", "burroughs-mcp", "chippewa-operating-system", "kronos", "nos", "scope", "puffin-os",
                     "convergent-technologies-operating-system", "cromemco-dos", "cromix", "aos", "dg-ux", "rdos",
                     "datapoint-2200", "datapoint", "deos", "heartos", "cp-m", "personal-cp-m", "cp-m-68k", "cp-m-8000",
                     "cp-m-86", "cp-m-86-plus", "personal-cp-m-86", "mp-m", "mp-m-ii", "mp-m-86", "mp-m-8-16",
                     "concurrent-cp-m", "concurrent-cp-m-86", "concurrent-cp-m-8-16", "concurrent-cp-m-68k", "dos",
                     "concurrent-dos", "concurrent-pc-dos", "concurrent-dos-8-16", "concurrent-dos-286",
                     "concurrent-dos-xm", "concurrent-dos-386", "concurrent-dos-386-mge", "concurrent-dos-68k",
                     "flexos", "flexos-186", "flexos-286", "siemens-s5-dos-mt", "ibm-4680-os", "ibm-4690-os",
                     "toshiba-4690-os", "flexos-386", "flexos-68k", "multiuser-dos", "cci-multiuser-dos",
                     "datapac-multiuser-dos", "datapac-system-manager", "ims-multiuser-dos", "real-32", "real-ng",
                     "dos-plus", "dr-dos", "palmdos", "star-trek", "novell-dos", "opendos", "batch-11-dos-11", "hp-ux",
                     "multi-programming-executive", "nonstop", "os-8", "rsts-e", "rsx-11", "rt-11", "tops-10", "tenex",
                     "tops-20", "digital-unix", "ultrix", "vms", "waits", "ose", "towns-os", "os-iv", "msp", "msp-ex",
                     "real-time-multiprogramming-operating-system", "gcos", "multics", "chromium-os", "chrome-os",
                     "container-optimized-os", "android", "glinux", "fuchsia", "integrity", "integrity-178b",
                     "u-velosity", "vulcan-o-s", "harris-unix", "hdos", "ht-11", "hp-multi-programming-executive",
                     "nonstop-os", "cp-6", "harmony-os", "irmx", "isis", "compatible-time-sharing-system",
                     "gm-os-&-gm-naa-i-o", "ibsys", "ijmon", "share-operating-system",
                     "university-of-michigan-executive-system", "os-360-and-successors", "os-360", "mft", "mft-ii",
                     "mvt", "system-370", "os-vs1", "multiple-virtual-storage", "mvs-xa", "mvs-esa", "os-390",
                     "phoenix-mvs", "z-os", "dos-360-and-successors", "bos-360", "tos-360", "dos-360", "dos-vs",
                     "dos-vse", "vse-sp", "z-vse", "cp-cms", "cp-40", "cp-67", "vm-370", "vm-xa", "virtual-machine",
                     "z-vm", "acp", "tpf", "z-tpf", "unix-like", "aix-370", "aix-esa", "opensolaris-for-system-z",
                     "uts", "linux-on-ibm-z", "mts", "tss-360", "music-sp", "orvyl-and-wylbur", "pc-dos", "os-2",
                     "os-2-warp", "ecomstation", "arcaos", "aix", "ibm-series-1", "edx", "rps", "cps", "serix",
                     "ibm-1130", "dms", "ibm-1800", "tsx", "mpx", "ibm-8100", "dpcx", "dppx", "ibm-system-3",
                     "ibm-system-34", "ibm-system-38", "cpf", "ibm-system-88", "stratus-vos", "as-400", "os-400",
                     "i5-os", "ibm-i", "workplace-os", "k42", "dynix", "j", "george", "executive", "tme", "icl-vme",
                     "vme-k", "remix-os", "lynxos", "microc-os-ii", "microc-os-iii", "xenix", "msx-dos", "ms-dos",
                     "dos-v", "windows", "windows-1.0", "windows-2.0", "windows-3.0", "windows-3.1x",
                     "windows-for-workgroups-3.1", "windows-3.2", "windows-for-workgroups-3.11", "windows-95",
                     "windows-98", "windows-millennium-edition", "windows-nt", "windows-nt-3.1", "windows-nt-3.5",
                     "windows-nt-3.51", "windows-nt-4.0", "windows-2000", "windows-xp", "windows-server-2003",
                     "windows-fundamentals-for-legacy-pcs", "windows-vista", "windows-azure", "windows-home-server",
                     "windows-server-2008", "windows-7", "windows-phone-7", "windows-server-2008-r2",
                     "windows-home-server-2011", "windows-server-2012", "windows-8", "windows-phone-8", "windows-8.1",
                     "windows-phone-8.1", "windows-server-2012-r2", "xbox-one-system-software", "windows-10",
                     "windows-10-mobile", "windows-server-2016", "windows-server-2019", "windows-ce", "windows-ce-3.0",
                     "windows-ce-5.0", "windows-ce-6.0", "windows-embedded-compact-7", "windows-embedded-compact-2013",
                     "windows-mobile", "singularity", "midori", "xbox-360-system-software", "azure-sphere", "threadx",
                     "altair-dos", "mobilinux", "tmx", "imos", "vrx", "es", "nextstep", "netware", "unixware",
                     "novell-supernos", "novell-corsair", "novell-expose", "open-enterprise-server", "rtxc-quadros",
                     "time-sharing-operating-system", "dspnano-rtos", "bada", "tizen", "sco-unix", "sco-open-desktop",
                     "sco-openserver", "berkeley-timesharing-system", "pikeos", "trsdos", "color-basic", "newdos-80",
                     "deskmate", "edos", "ti-rtos-kernel", "tron", "t-kernel", "exec-i", "exec-ii", "exec-8", "vs-9",
                     "wps", "ois", "vxworks", "z80-rio", "zorin-os", "lisp-machines--inc.", "symbolics",
                     "texas-instruments", "xerox", "andos", "csi-dos", "mk-dos", "pilot", "perq", "elbrus", "eos",
                     "elxsi", "mai-basic-four", "michigan-terminal-system", "es-evm", "pc-mos-386", "buran",
                     "sintran-iii", "skyos", "soda", "theos", "tsx-32", "dx10", "aegis", "coherent", "dc-osx", "dnix",
                     "helios", "interactive-unix", "irix", "meikos", "os-9", "os-9000", "osf-1", "openstep", "qnx",
                     "rmx", "sinix", "solaris", "sunos", "super-ux", "system-v", "system-v-at--386", "trusted-solaris",
                     "uniflex", "unicos", "zenix", "minix", "bsd", "freebsd", "dragonflybsd", "midnightbsd", "ghostbsd",
                     "trueos", "openbsd", "bitrig", "darwin", "gnu", "linux", "redox", "android-x86",
                     "cray-linux-environment", "opensolaris", "illumos", "openindiana", "nexenta-os", "smartos",
                     "rtems", "haiku", "syllable-desktop", "vsta", "plurix", "tunis", "dahliaos", "cosmos", "freedos",
                     "genode", "ghost-os", "its", "osfree", "osv", "phantom-os", "reactos", "sharpos", "templeos",
                     "visopsys", "research-unix", "amoeba", "croquet", "eros", "capros", "harmony", "helenos", "house",
                     "ilios", "l4", "mach", "nemesis", "spring", "the-multiprogramming-system", "thoth", "v", "verve",
                     "xinu", "86-dos", "dr-dos-startrek", "dr-dos-winbolt", "pts-dos", "turbodos", "desqview",
                     "x-windowing", "banyan-vines", "cambridge-ring", "cisco-ios", "cisco-nx-os", "ctos", "data-ontap",
                     "extremeware", "extremexos", "fabric-os", "junos", "network-operating-system",
                     "novell-open-enterprise-server", "plan-9", "blis-cobol", "bluebottle", "bs1000", "bs2000",
                     "bs3000", "flex9", "gem", "geos", "javaos", "jnode", "jx", "kernal", "merlin", "morphos",
                     "fujitsu", "oberon-(operating-system)", "osd-xc", "pick", "primos", "sinclair-qdos", "ssb-dos",
                     "symobi", "tripos", "ucsd-p-system", "vos", "vos3", "vm2000", "visi-on", "vps-vm", "aros",
                     "atheos", "baremetal", "dexos", "emutos", "lse-os", "menuetos", "kolibrios", "toaruos", "ponyos",
                     "serenityos", "dip-dos", "embedded-linux", "replicant", "lineageos",
                     "list-of-custom-android-distributions", "firefox-os", "angstrom-distribution", "familiar-linux",
                     "maemo", "openzaurus", "webos", "access-linux-platform", "openmoko-linux", "ophone", "meego",
                     "moblin", "motomagx", "qt-extended", "sailfish-os", "ubuntu-touch", "postmarketos", "magic-cap",
                     "palm-os", "pen-geos", "penpoint-os", "pvos", "symbian-os", "epoc", "pocket-pc", "windows-phone",
                     "ipodlinux", "iriver-clix", "rockbox", "blackberry-os", "symbian-platform", "blackberry-10",
                     "catos", "ios-xr", "lancom-systems", "openwrt", "dd-wrt", "lede", "gargoyle", "librecmc",
                     "zeroshell", "rtos", "m0n0wall", "opnsense", "pfsense", "apache-mynewt", "chibios-rt",
                     "erika-enterprise", "ecos", "nucleus-rtos", "nuttx", "ncos", "freertos--openrtos-and-safertos",
                     "openembedded", "psos", "rex-os", "riot", "rom-dos", "tinyos", "rt-thread", "windows-iot",
                     "windows-embedded", "wombat-os", "zephyr", "brickos", "lejos", "cambridge-cap-computer",
                     "flex-machine", "hydra", "keykos"]  # generated from util/platform_list.rb
    platform_list_col = array(*[lit(x) for x in platform_list])

    df = spark.read.parquet("/user/***REMOVED***/StackOverflow/PostHistory.parquet") \
        .select(["_PostId", "_Text", '_PostHistoryTypeId']) \
        .filter(col("_PostHistoryTypeId") == 3) \
        .withColumn("_Tags", expr("substring(_Text, 2, length(_Text) - 2)")) \
        .withColumn("_Tags", split(col("_Tags"), "><")) \
        .withColumn("#tags", when(size("_Tags") < 0, 0).otherwise(size("_Tags"))) \
        .withColumn("contains_language_tag", size(array_intersect("_Tags", language_list_col)) > 0) \
        .withColumn("contains_platform_tag", size(array_intersect("_Tags", platform_list_col)) > 0) \
        .drop("_Tags", "_PostHistoryTypeId", "_Text") \
        .withColumnRenamed('_PostId', '_Id')

    return df
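
The tag-parsing steps can be sanity-checked in isolation on a literal tag string (toy input, no parquet needed):

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, split

spark = SparkSession.builder.getOrCreate()
tags = spark.createDataFrame([("<java><android>",)], ["_Text"]) \
    .withColumn("_Tags", expr("substring(_Text, 2, length(_Text) - 2)")) \
    .withColumn("_Tags", split(col("_Tags"), "><"))
tags.show(truncate=False)  # _Tags becomes [java, android]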
Example #15
def get_tokens_match_over_diff(df_to_process):
    df_processed = df_to_process.withColumn(
        "score",
        F.size(F.array_intersect("token_filtered", "to_match")) /
        F.size("token_filtered"))
    return df_processed
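
A one-row sketch: the score is |token_filtered ∩ to_match| / |token_filtered|, i.e. the share of the input tokens covered by the reference list:

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(["main", "st", "apt"], ["st", "ave"])],
    ["token_filtered", "to_match"])
get_tokens_match_over_diff(df).show()  # score = 1/3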
Example #16
df_mincityear_onw_cit=(
  df_ani
  .filter(sort_pub_year+' >= '+mincityear)
  .select(
    func.col('Eid').alias('CitingEid'),
    func.explode('citations').alias('Eid'),
    func.col('Au.auid').alias('CitingAuids')
  )
  .distinct()
  .join(
    df_ani.select(
      'Eid',
      func.col('Au.auid').alias('CitedAuids')
    ),["Eid"]
  )
  .withColumn('overLappingAuthors',func.size(func.array_intersect('CitingAuids','CitedAuids')))
  .select(
    "CitingEid",
    "Eid",
    func.expr("IF(overLappingAuthors>0,1,0)").alias('isSelfCitation'),
    func.expr("IF(overLappingAuthors>0,NULL,CitingEid)").alias('CitingEidNonSelf'),
  )
  .groupBy('Eid')
  .agg(
    func.count('*').alias('CitationCount'),
    func.sum('isSelfCitation').alias('SelfCitationCount'),
    (func.count('*')-func.sum('isSelfCitation')).alias('CitationCountNonSelf'),
    func.collect_list('CitingEid').alias('CitingEids'),
    func.collect_list('CitingEidNonSelf').alias('CitingEidsNonSelf'),
  )
)
Example #17
    def insertDataToGraph(self):
        spark = self.sparkSession
        neo4j = self.neo4jDriver.session()
        sc = spark.sparkContext
        feats = self.user_features_df
        list_cat = self.list_categories
        cat_count = len(list_cat)

        # load edges and derive the node list from their endpoints
        e = self.edges_df
        self.nodes_df = e.select("Source").union(
            e.select("Target")).distinct().withColumnRenamed('Source', 'id')
        n = self.nodes_df
        print(feats.count(), list_cat, e.count(), n.count())
        feats.printSchema()

        #cache dataframes
        feats.cache()
        e.cache()
        n.cache()

        # attach a userCategory list: keep categories whose focus rate is at least 1/cat_count
        u_focus_rate = feats.select(
            col('id'),
            col("user_features{}.dict_focus_rate".format((
                "_" + self.method_name
            ) if len(self.method_name) > 0 else "")).alias("dict_focus_rate"))
        u_with_category = u_focus_rate.withColumn(
            "userCategory", array([lit(c) for c in list_cat]))
        for cat in list_cat:
            u_with_category = u_with_category.withColumn(
                "temp",
                when(
                    col("dict_focus_rate.{}".format(cat)) < 1 / cat_count,
                    array_remove(u_with_category["userCategory"],
                                 cat)).otherwise(
                                     u_with_category["userCategory"])).drop(
                                         "userCategory").withColumnRenamed(
                                             "temp", "userCategory")
        u_with_category = u_with_category.select("id", "userCategory")

        #join n and u_with_category
        n_with_category = n.join(u_with_category, "id", how="left")

        #add category columns to e
        e_with_category = e.join(n_with_category,
                                 e.Source == n_with_category.id,
                                 how="left").withColumnRenamed(
                                     "userCategory", "sourceCategory").select(
                                         "Source", "Target", "sourceCategory")
        e_with_category = e_with_category.join(
            n_with_category,
            e_with_category.Target == n_with_category.id,
            how="left").withColumnRenamed("userCategory",
                                          "targetCategory").select(
                                              "Source", "Target",
                                              "sourceCategory",
                                              "targetCategory")

        #determine intersection between sourceCategory and targetCategory
        e_with_category = e_with_category.withColumn(
            "Categories",
            array_intersect(e_with_category["sourceCategory"],
                            e_with_category["targetCategory"]))

        #flatten out categories of edges
        e_with_category = e_with_category.withColumn(
            "Category",
            explode(col("Categories"))).select("Source", "Target", "Category")
        print("e_with_category", e_with_category.count())
        e_with_category.printSchema()

        ## Insert data
        insert_query = '''
        UNWIND {triples} as triple
        MERGE (p1:User {id:triple[0]})
        MERGE (p2:User {id:triple[1]}) WITH p1,p2,triple
        CALL apoc.create.relationship(p1, triple[2], {}, p2) YIELD rel
        RETURN *
        '''
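        # Note: `{triples}` is the pre-Neo4j-4 parameter syntax; on Neo4j 4+ the
        # same statement would read `UNWIND $triples as triple`.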

        e_listoftriples = e_with_category.toPandas()[[
            'Source', 'Target', 'Category'
        ]].values.tolist()
        print("e_listoftriples:", len(e_listoftriples))
        batches = list(self.generate_batches(e_listoftriples, 7000))
        for batch in batches:
            neo4j.run(insert_query, parameters={"triples": batch})

        e_with_category.show()
        print("batches size:", len(batches), " last batch:", len(batches[-1]))