Example #1
def machinelearning():
    data1 = sc.textFile("/user/SentimentalData/data1.csv")
    spark = getSparkSessionInstance(data1.context.getConf())

    r1 = data1.mapPartitions(lambda x : csv.reader(x)).map(lambda x: [x[3], x[1]])
    rddr1 = r1.map(lambda x: [x[0].replace(',',' '),x[1]]).map(lambda x: [x[0].replace('/',' '),x[1]]).map(lambda x: [x[0].replace('?',' '),x[1]])
    rddr1 = rddr1.map(lambda x: [x[0].replace(':',' '),x[1]]).map(lambda x: [x[0].replace('-',' '),x[1]]).map(lambda x: [x[0].replace('.',' '),x[1]])
    rddr1 = rddr1.map(lambda x: [x[0].replace(')',' '),x[1]]).map(lambda x: [x[0].replace('(',' '),x[1]]).map(lambda x: [x[0].replace('!',' '),x[1]])
    rddr1 = rddr1.map(lambda sn: [' '.join(w for w in sn[0].split() if not w.startswith(('@', 'http', '"', '&', 'rt'))), sn[1]])
    rddr1 = rddr1.map(lambda sn: [' '.join(w for w in sn[0].split() if not w.endswith('"') and len(w) > 2), sn[1]])
    # Drop common English stop words (the same word list as the original chained comparisons).
    STOP_WORDS = set("""a's able about above according accordingly across actually after afterwards again against ain't
        all allow allows almost alone along already also although always am among amongst an and another any anybody
        anyhow anyone anything anyway anyways anywhere apart appear appreciate appropriate are aren't around as aside
        ask asking associated at available away awfully b be became because become becomes becoming been before
        beforehand behind being believe below beside besides best better between beyond both brief but by c c'mon c's
        came can can't cannot cant cause causes certain certainly changes clearly co com come comes concerning
        consequently consider considering contain containing contains corresponding could couldn't course currently d
        definitely described despite did didn't different do does doesn't doing don't done down downwards during e
        each edu eg eight either else elsewhere enough entirely especially et etc even ever every everybody everyone
        everything everywhere ex exactly example except f far few fifth first five followed following follows for
        former formerly forth four from further furthermore g get gets getting given gives go goes going gone got
        gotten greetings h had hadn't happens hardly has hasn't have haven't having he he's hello help hence her here
        here's hereafter hereby herein hereupon hers herself hi him himself his hither hopefully how howbeit however i
        i'd i'll i'm i've ie if ignored immediate in inasmuch inc indeed indicate indicated indicates inner insofar
        instead into inward is isn't it it'd it'll it's its itself j just k keep keeps kept know known knows l last
        lately later latter latterly least less lest let let's like liked likely little look looking looks ltd m
        mainly many may maybe me mean meanwhile merely might more moreover most mostly much must my myself n name
        namely nd near nearly necessary need needs neither never nevertheless new next nine no nobody non none noone
        nor normally not nothing novel now nowhere o obviously of off often oh ok okay old on once one ones only onto
        or other others otherwise ought our ours ourselves out outside over overall own p particular particularly per
        perhaps placed please plus possible presumably probably provides q que quite qv r rather rd re really
        reasonably regarding regardless regards relatively respectively right s said same saw say saying says second
        secondly see seeing seem seemed seeming seems seen self selves sensible sent serious seriously seven several
        shall she should shouldn't since six so some somebody somehow someone something sometime sometimes somewhat
        somewhere soon sorry specified specify specifying still sub such sup sure t t's take taken tell tends th than
        thank thanks thanx that that's thats the their theirs them themselves then thence there there's thereafter
        thereby therefore therein theres thereupon these they they'd they'll they're they've think third this thorough
        thoroughly those though three through throughout thru thus to together too took toward towards tried tries
        truly try trying twice two u un under unfortunately unless unlikely until unto up upon us use used useful uses
        using usually uucp v value various very via viz vs w want wants was wasn't way we we'd we'll we're we've
        welcome well went were weren't what what's whatever when whence whenever where where's whereafter whereas
        whereby wherein whereupon wherever whether which while whither who who's whoever whole whom whose why will
        willing wish with within without won't wonder would wouldn't x y yes yet you you'd you'll you're you've your
        yours yourself yourselves z zero""".split())
    rddr1 = rddr1.map(lambda sn: [' '.join(w for w in sn[0].split() if w not in STOP_WORDS), sn[1]])

    partsDF1 = spark.createDataFrame( rddr1.map(lambda x : Row(sentence=str.strip(x[0]), label=int(x[1]))) )
    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    tokenized1 = tokenizer.transform(partsDF1)
    remover = StopWordsRemover(inputCol="words", outputCol="base_words")
    base_words1 = remover.transform(tokenized1)
    train_data_raw1 = base_words1.select("base_words", "label")
    word2Vec1 = Word2Vec(vectorSize=3, minCount=0, inputCol="base_words", outputCol="features")
    model1 = word2Vec1.fit(train_data_raw1)
    final_train_data1 = model1.transform(train_data_raw1)
    final_train_data1 = final_train_data1.select("label", "features")
      
    lr = LogisticRegression(maxIter=2000, regParam=0.001, elasticNetParam=0.0001)
    lrModel = lr.fit(final_train_data1)
    lrModel.transform(final_train_data1)
    
    data2 = sc.textFile("/user/ml/data2.csv")

    r2 = data2.mapPartitions(lambda x : csv.reader(x)).map(lambda x: [x[3], x[1]])
    rddr2 = r2.map(lambda x: [x[0].replace(',',' '),x[1]]).map(lambda x: [x[0].replace('/',' '),x[1]]).map(lambda x: [x[0].replace('?',' '),x[1]])
    rddr2 = rddr2.map(lambda x: [x[0].replace(':',' '),x[1]]).map(lambda x: [x[0].replace('-',' '),x[1]]).map(lambda x: [x[0].replace('.',' '),x[1]])
    rddr2 = rddr2.map(lambda x: [x[0].replace(')',' '),x[1]]).map(lambda x: [x[0].replace('(',' '),x[1]]).map(lambda x: [x[0].replace('!',' '),x[1]])
    rddr2 = rddr2.map(lambda sn: [' '.join(w for w in sn[0].split() if not w.startswith(('@', 'http', '"', '&', 'rt'))), sn[1]])
    rddr2 = rddr2.map(lambda sn: [' '.join(w for w in sn[0].split() if not w.endswith('"') and len(w) > 2), sn[1]])
    # Drop the same stop words as for the first data set.
    rddr2 = rddr2.map(lambda sn: [' '.join(w for w in sn[0].split() if w not in STOP_WORDS), sn[1]])
    
    partsDF2 = spark.createDataFrame( rddr2.map(lambda x : Row(sentence=str.strip(x[0]), label=int(x[1]))) )
    tokenized2 = tokenizer.transform(partsDF2)
    base_words2 = remover.transform(tokenized2)
    train_data_raw2 = base_words2.select("base_words", "label")
    word2Vec2 = Word2Vec(vectorSize=3, minCount=0, inputCol="base_words", outputCol="features")
    model2 = word2Vec2.fit(train_data_raw2)
    final_train_data2 = model2.transform(train_data_raw2)
    final_train_data2 = final_train_data2.select("label", "features")

    predictions = lrModel.transform(final_train_data2)
    
    return lrModel
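The fitted lrModel is returned but the predictions on the second data set are never scored. A minimal sketch of how they could be evaluated before the return, assuming the labels are binary (this evaluation step is not part of the original example):

    from pyspark.ml.evaluation import BinaryClassificationEvaluator

    # Hypothetical evaluation of the second data set: area under the ROC curve.
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label")
    print("AUC: {}".format(evaluator.evaluate(predictions)))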
Example #2
def main():
    spark = SparkSession.builder \
        .appName("Spark CV-job ad matching") \
        .config("spark.some.config.option", "some-value") \
        .master("local[*]") \
        .getOrCreate()

    VECTOR_SIZE = 50

    df_jobs = spark.read.json("alljobs4rdd/alljobs.jsonl").filter("description is not NULL").cache()
    df_jobs.registerTempTable("jobs")
    df_cvs = spark.read.json("allcvs4rdd/allcvs.jsonl").cache()
    df_cvs.registerTempTable("cvs")
    df_categories = spark.read.json("allcategories4rdd/allcategories.jsonl").cache()
    df_categories.registerTempTable("categories")

    joined = spark.sql("SELECT description AS text, jobId AS id, 'job' AS type FROM jobs UNION ALL \
               SELECT description AS text, cvid AS id, 'cv' AS type FROM cvs UNION ALL \
               SELECT skillText AS text, id AS id, 'categories' AS type FROM categories")

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    tokenized = tokenizer.transform(joined)

    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    removed = remover.transform(tokenized)

    word2Vec = Word2Vec(vectorSize=VECTOR_SIZE, minCount=0, inputCol="filtered", outputCol="vectors")
    model = word2Vec.fit(removed)
    resultDF = model.transform(removed)

    normalizer = Normalizer(inputCol="vectors", outputCol="result", p=2)
    l1NormData = normalizer.transform(resultDF)

    l1NormData.registerTempTable("resultTable")
    jobs = spark.sql("SELECT result AS jobsVec, id AS jobId FROM resultTable WHERE type = 'job'")
    cvs = spark.sql("SELECT result AS cvsVec, id AS cvid FROM resultTable WHERE type = 'cv'")
    categories = spark.sql("SELECT result AS categoriesVec, cat.id, cat.skillName, category FROM resultTable AS rt\
    LEFT JOIN categories AS cat ON rt.id = cat.id WHERE type = 'categories'")

    #Calculate job-cv similarity START
    crossJoined_job_cv = jobs.crossJoin(cvs)
    calculated_job_cv = crossJoined_job_cv.rdd.map(lambda x: (x.jobId, x.cvid, calculate_distance(x.jobsVec, x.cvsVec)))\
    .toDF(["jobid", "cvid", "distance"]).orderBy(asc("jobid")).coalesce(2)
    calculated_job_cv.write.csv('Calculated/word2vec2/job-cv')
    #Calculate job-cv similarity END

    #Calculate cv-category similarity START
    crossJoined_cv_cat = cvs.crossJoin(categories)
    calculated_cv_cat = crossJoined_cv_cat.rdd.map(lambda x: (x.cvid, x.id, x.skillName, x.category, calculate_distance(x.cvsVec, x.categoriesVec)))\
    .toDF(["cvid", "category_id", "skillName", "category", "distance"]).orderBy(asc("cvid"), asc("distance")).coalesce(2)
    calculated_cv_cat.write.csv('Calculated/word2vec2/cv-category')
    #Calculate cv-category similarity END

    #Job-category START
    crossJoined_job_cat = jobs.select("jobId", "jobsVec").crossJoin(categories.select("id", "skillName", "category", "categoriesVec"))
    calculatedDF_job_cat = crossJoined_job_cat.rdd\
    .map(lambda x: (x.jobId, x.id, x.skillName, x.category, calculate_distance(x.jobsVec, x.categoriesVec)))\
    .toDF(["jobid", "catid", "skillName", "category", "distance"])
    ordered_job_cat = calculatedDF_job_cat.orderBy( asc("distance")).coalesce(2)
    ordered_job_cat.write.csv('Calculated/word2vec2/job-category')
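The snippet above calls a calculate_distance helper that is not shown. A minimal sketch, assuming it returns the cosine distance between two Spark ML vectors (which the Normalizer above has already L2-normalized):

    import numpy as np

    def calculate_distance(v1, v2):
        # The vectors are L2-normalized, so the dot product equals the cosine similarity.
        a, b = v1.toArray(), v2.toArray()
        return float(1.0 - np.dot(a, b))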
Example #3
def trainItem2vec(dataset, filename, saveToRedis=False, redisKeyPrefix=None):
    '''
    Train the embeddings; inputCol must be of type array(string).
    After training, write the vectors to `filename`.
    :param dataset:
    :return:
    '''
    word2vec = Word2Vec(vectorSize=10,
                        windowSize=5,
                        maxIter=10,
                        inputCol='movieIds')
    model = word2vec.fit(dataset)
    print('model fitted')
    # Print similar movies, ranked by dot-product similarity
    synonyms = model.findSynonymsArray('158', 20)
    for movieid, similarity in synonyms:
        print('{}:{}'.format(movieid, similarity))

    with open('./modeldata/{}'.format(filename), 'w') as f:
        for row in model.getVectors().collect():
            tmp = ','.join([str(vector) for vector in row['vector']])
            f.write('{}:{}\n'.format(row['word'], tmp))

    # redis-cli eval "redis.call('del', unpack(redis.call('keys','*')))" 0  -- bulk-delete all keys (Windows)
    if saveToRedis:
        pool = redis.ConnectionPool(host=HOST, port=PORT)
        # TTL for each key, in seconds
        ex = 60 * 10
        r = redis.Redis(connection_pool=pool)
        for i, row in enumerate(model.getVectors().collect()):
            tmp = ','.join([str(vector) for vector in row['vector']])
            if i == 1:
                print(type(row['vector']))
            r.set('{}:{}'.format(redisKeyPrefix, row['word']), tmp, ex)
    return model
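When saveToRedis is enabled, each embedding is stored as a comma-separated string. A small sketch of reading one back (HOST, PORT and redisKeyPrefix are the names used above; the movie id '158' is an arbitrary example):

    import redis

    r = redis.Redis(host=HOST, port=PORT)
    raw = r.get('{}:{}'.format(redisKeyPrefix, '158'))
    if raw is not None:
        embedding = [float(v) for v in raw.decode('utf-8').split(',')]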
Example #4
def analyzeSent(host, port, db_name, ALL_WEB_DATA):
    '''
    Run clustering analysis on the split sentences.
    :return:
    '''
    sen = senSplit(host, port, db_name, ALL_WEB_DATA)
    df = spark.createDataFrame(sen)

    def tokenizer(row):
        '''
        Per-row mapping: remove all whitespace from the sentence and wrap it in a single-element list.
        :param row:
        :return:
        '''
        result = list()
        row_sp = ''.join(row.sent.split())
        result.append(row_sp)
        return [result]

    rdd = df.selectExpr('_1 as sent').rdd.map(tokenizer)
    df = spark.createDataFrame(rdd, ['sent'])
    wv_df = Word2Vec(vectorSize=5,
                     minCount=0,
                     inputCol="sent",
                     outputCol="features")
    model_wv = wv_df.fit(df)
    wv_df = model_wv.transform(df)
    # model_wv.getVectors().show(truncate=False)
    km = KMeans(featuresCol="features", k=5)
    model_km = km.fit(wv_df)
    df_km = model_km.transform(wv_df)
    df_km.select('sent', 'prediction').show()
    df_km.select('sent', 'prediction').show(truncate=False)
Example #5
    def transform(self):
        word2Vec = Word2Vec(vectorSize=self._vector_size, inputCol="content", outputCol="features")
        model = word2Vec.fit(self._mapped_data)

        w2v_data = model.transform(self._mapped_data)

        return w2v_data.drop('content')
Example #6
def Main():
    spark = SparkSession.builder\
        .appName("Word2Vec")\
        .config("spark.driver.cores", "8")\
        .config("spark.driver.maxResultSize", "13312m")\
        .config("spark.driver.memory", "26624m")\
        .config("spark.executor.cores", "8")\
        .config("spark.executor.memory", "37237m")\
        .getOrCreate()

    FILE_NO = 7
    total_df = []
    for idx in range(FILE_NO):
        each_df = spark.read.format("json") \
            .option("mode", "FAILFAST") \
            .option("inferSchema", "true") \
            .load("gs://dataproc-7e10897a-5391-4ea0-b815-f6e72cf284f7-asia-east1/data/contents/data.{}".format(idx))
        total_df.append(each_df)

    df = reduce(DataFrame.unionAll, total_df)
    df = df.select(df.id, explode(df.morphs).alias("words"))

    word2vec = Word2Vec(vectorSize=300,
                        minCount=0,
                        windowSize=2,
                        numPartitions=10,
                        inputCol="words",
                        outputCol="vector")
    model = word2vec.fit(df)
    df = model.transform(df)
    final = df.groupBy("id") \
        .agg(collect_list(struct("vector")).alias("matrix"))

    final.show(1)
Example #7
def Doc2vec(args, train_data, vector_size=10, window_size=5, input_col='reference_list'):
        word2Vec = Word2Vec(vectorSize=vector_size, minCount=2, seed=42, maxIter=1, windowSize=window_size,
                            inputCol=input_col,
                            outputCol="ref_vec")
        model = word2Vec.fit(train_data)
        model_name = "{}_{}_{}_{}".format(args.model_name, vector_size, window_size, args.is_item)
        model.write().overwrite().save(os.path.join(model_pth, model_name))
        vec = model.getVectors()

        train_data = model.transform(train_data)
        train_data = train_data.join(vec, train_data.impression == vec.word, how='left')
        train_data = train_data.withColumn('score', getCosinDis("ref_vec", 'vector'))
        train_data = train_data.withColumn("row_number",
                                           F.rank().over(
                                                   Window.partitionBy(GR_COLS).orderBy(train_data["score"].desc())))

        data = train_data.select(GR_COLS + ['action_type', 'impressions', 'reference',
                                            'score', 'row_number', 'impression']).filter("reference==impression")

        demo_cnt = data.cube("row_number").count().toPandas()
        demo_cnt = demo_cnt.dropna()
        # demo_cnt.toPandas(".csv".format(model_name))
        MRR = ((demo_cnt['count'] / demo_cnt['count'].sum()) * (1. / (26.0 - demo_cnt['row_number']))).sum()
        print('**---**' * 20)
        print("InputCol:{} vector_size:{} window_size:{} MRR:{}".format(input_col, vector_size, window_size, MRR))

        return MRR
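Doc2vec relies on a getCosinDis UDF that is defined elsewhere. A minimal sketch, assuming it scores the cosine similarity between the document vector (ref_vec) and a word vector:

    import numpy as np
    from pyspark.sql import functions as F
    from pyspark.sql.types import DoubleType

    @F.udf(returnType=DoubleType())
    def getCosinDis(ref_vec, vector):
        # Hypothetical cosine-similarity UDF; returns 0.0 when either vector is missing.
        if ref_vec is None or vector is None:
            return 0.0
        a, b = ref_vec.toArray(), vector.toArray()
        denom = float(np.linalg.norm(a) * np.linalg.norm(b))
        return float(np.dot(a, b) / denom) if denom else 0.0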
    def word2vec(self):
        from pyspark.ml.feature import Word2Vec

        documentDF = self.session.createDataFrame(
            [("Hi I heard about Spark".split(" "), ),
             ("I wish Java could use case classes".split(" "), ),
             ("Logistic regression models are neat".split(" "), )], ["text"])

        word2Vec = Word2Vec(vectorSize=3,
                            minCount=0,
                            inputCol="text",
                            outputCol="result")
        model = word2Vec.fit(documentDF)

        # transform simply averages the word vectors of each document
        result = model.transform(documentDF)
        for row in result.collect():
            text, vector = row
            print("Text: [%s] => \nVector: %s\n" %
                  (", ".join(text), str(vector)))

        # If we want to pull all the word vectors out for later use:
        res = dict([(item["word"], item["vector"].toArray())
                    for item in model.getVectors().collect()])
        print(res["heard"])
Example #9
    def compute_word2vec(self,
                         input_df,
                         output_vec_len,
                         window_size=5,
                         sub_test=False):
        """
        Compute the word2vec features for a given dataframe
        @param input_df       : the dataframe to perform the action upon
        @param output_vec_len : the length (int) of the output vector
        @param window_size    : the word2vec window size (int)
        @param sub_test       : if True, return the raw result without the final vector conversion
        @return output dataframe with the output column (self.output_col)
        """
        # ensure that the input column is typed as array<string>, as Word2Vec expects
        toArray = udf(lambda vs: vs, ArrayType(StringType()))
        df = input_df.withColumn(self.input_col,
                                 toArray(input_df[self.input_col]))
        # initialize word2vec
        word2Vec = Word2Vec(vectorSize=output_vec_len,
                            windowSize=window_size,
                            minCount=5,
                            inputCol=self.input_col,
                            outputCol=self.output_col)
        # train word2vec model
        model = word2Vec.fit(df)
        # compute transformation
        result = model.transform(df)
        # convert the result column to a dense vector
        if not sub_test:
            conv = udf(lambda vs: Vectors.dense(vs), VectorUDT())
            out = result.withColumn(self.output_col, conv(result[self.output_col]))
            return out
        else:
            return result
Example #10
def frequency_vector_DataFrame(trainDF, cluster_count):
    regTokenizer = RegexTokenizer(inputCol="reviewText", outputCol="words", pattern="[^a-z]")
    dfTokenizer = regTokenizer.transform(trainDF)

    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    df_remover = remover.transform(dfTokenizer)

    # feature extraction using Word2vec
    word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="filtered", outputCol="word2vec")
    vectors = word2Vec.fit(df_remover).getVectors()
    vectors_DF = vectors.select(vectors.word, vectors.vector.alias("features"))

    #  DF as kmeans
    kmeans = KMeans().setK(cluster_count).setSeed(1)
    km_model = kmeans.fit(vectors_DF)

    # Broadcast operation after getting the words and predictions
    vocabDF = km_model.transform(vectors_DF).select("word", "prediction")
    vocabDict = dict(vocabDF.rdd.collect())
    vocab_dict = sc.broadcast(vocabDict)

    # Cluster vector is in RDD form
    reviewsDF = df_remover.select(df_remover.filtered, df_remover.label).rdd
    clusterVectorRdd = reviewsDF.map(partial(word_to_cluster, vocab_dict=vocab_dict))


    cluster_frequency_feature_Rdd = clusterVectorRdd.map(partial(cluster_frequency_vector, cluster_count=cluster_count))

    cluster_freqDF = cluster_frequency_feature_Rdd.map(lambda xy: Row(xy[0], xy[1])).toDF()
    cluster_freq_featureDF = cluster_freqDF.select(cluster_freqDF._1.alias("features"), cluster_freqDF._2.alias("label"))

    return cluster_freq_featureDF
Example #11
def trainData():
    #rdd = sc.parallelize(rdd)
    #rdd.foreach(print)
    #rdd = sc.textFile("/ccga/SentimentAnalysisDataset.csv")
    '''#################################################TRAINING DATA SET#################################################'''
    rddTrain = sc.textFile("/ccga/set100k.csv")
    r = rddTrain.mapPartitions(lambda x: csv.reader(x))
    parts = r.map(lambda x: Row(sentence=str.strip(x[3]), label=int(x[1])))
    spark = getSparkSessionInstance(rddTrain.context.getConf())
    partsDF = spark.createDataFrame(parts)
    #partsDF.show(truncate=False)
    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    tokenized = tokenizer.transform(partsDF)
    #tokenized.show(truncate=False)
    remover = StopWordsRemover(inputCol="words", outputCol="base_words")
    base_words = remover.transform(tokenized)
    #base_words.show(truncate=False)
    train_data_raw = base_words.select("base_words", "label")
    #train_data_raw.show(truncate=False)
    #base_words = train_data_raw.select("base_words")
    #base_words_rdd = base_words.rdd
    #print(base_words_rdd.collect())
    #base_words_map = base_words_rdd.flatMap(lambda x: x[0])
    #base_words_rdd.collect()
    word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="base_words", outputCol="features")
    model = word2Vec.fit(train_data_raw)
    final_train_data = model.transform(train_data_raw)
    #final_train_data.show()
    final_train_data = final_train_data.select("label", "features")
    #final_train_data.show(truncate=False)
    lr = LogisticRegression(maxIter=1000, regParam=0.001, elasticNetParam=0.0001)
    lrModel = lr.fit(final_train_data)
    trained = lrModel.transform(final_train_data)
    return lrModel
    '''#################################################TRAINING DATA SET#################################################'''
Example #12
def train_word2vec(data):

   word2Vec = Word2Vec(vectorSize=embedded_size, minCount=0, inputCol="comment", outputCol="result")
   model = word2Vec.fit(data)
#   model.save(sc,"train_results/word2vec.train")

   return model
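The commented-out model.save(sc, ...) call uses the old RDD-based MLlib signature; the pyspark.ml Word2VecModel persists by path. A hedged sketch of saving and reloading the fitted model:

    from pyspark.ml.feature import Word2VecModel

    model.write().overwrite().save("train_results/word2vec.train")
    reloaded = Word2VecModel.load("train_results/word2vec.train")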
Example #13
def learn_with(dataset=None, save=True):
    [data, target_multi, target_single] = dataset

    # Build word vectors with word2vec; vector length 100
    documentDF = spark.createDataFrame(
        [(data[i].split(" "), emoji_id_mapper[target_single[i]])
         for i in range(len(data))], ["text", "label"])
    word2Vec = Word2Vec(vectorSize=100,
                        minCount=0,
                        inputCol="text",
                        outputCol="features")
    model = word2Vec.fit(documentDF)
    result = model.transform(documentDF)

    result.select("label", "features").show()

    # train & test data
    (trainingData, testData) = result.randomSplit([0.8, 0.2], seed=100)
    print("Training Dataset Count: " + str(trainingData.count()))
    print("Test Dataset Count: " + str(testData.count()))

    # Build the LR model
    lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
    lrModel = lr.fit(trainingData)
    predictions = lrModel.transform(testData)
    predictions.select("features", "label", "prediction") \
        .show(n=30, truncate=30)

    if save:
        lrModel.save('trained_models/')
Example #14
def main():

    data_schema = types.StructType([
        types.StructField('title', types.StringType()),
        types.StructField('text', types.StringType()),
        types.StructField('label', types.IntegerType())
    ])

    datadf = spark.read.csv("s3://projfakenews/ProcessedDatawithoutStemming",
                            schema=data_schema)

    datadf.show()

    word2Vec = Word2Vec(vectorSize=100,
                        minCount=0,
                        inputCol="text",
                        outputCol="features")
    datadf = datadf.withColumn("text", functions.array("text"))
    model = word2Vec.fit(datadf)
    result = model.transform(datadf)

    result.show()

    result.select("features").show(1)

    result = result.dropna()
    result = result.randomSplit([0.8, 0.2], 24)
    print(result[0].count(), result[1].count())
    make_model(result[0], result[1])
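make_model is not included in the snippet. A minimal sketch, assuming it trains a logistic-regression classifier on the word2vec features and reports accuracy on the held-out split (column names follow the schema above):

    from pyspark.ml.classification import LogisticRegression
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator

    def make_model(train_df, test_df):
        # Hypothetical trainer: fit LR on the "features" column and evaluate on the test split.
        lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=20)
        lr_model = lr.fit(train_df)
        predictions = lr_model.transform(test_df)
        evaluator = MulticlassClassificationEvaluator(labelCol="label", metricName="accuracy")
        print("Test accuracy: {}".format(evaluator.evaluate(predictions)))
        return lr_model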
Example #15
def main(*args):
    if len(args) != 2:
        print("Please provide both input and output directories!")
        sys.exit(1)

    input_fn, output_fn = args[0], args[1]
    conf = SparkConf()
    conf.setAppName("Word2Vec")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    # create the post table contains the tags info as a string
    posts = sc.textFile(input_fn)
    df_post = ((posts.map(lambda line: line.strip()).filter(
        lambda line: line.startswith('<row')).filter(
            lambda line: '/>' in line).map(
                Post.parse).map(lambda x: (x.owneruserid, x.tags)).toDF(
                    ['ownerid', 'tags'])))
    # parse the tags using the generic functions into a list of words
    df_tags = (df_post.withColumn(
        'input', F.regexp_replace(F.col('tags'), '<', '')).withColumn(
            'input',
            F.lower(F.col('input'))).withColumn('input',
                                                F.split(F.col('input'), '>')))
    # build the machine learning pipeline
    w2v = Word2Vec(inputCol="input",
                   outputCol="vectors",
                   vectorSize=100,
                   minCount=10,
                   seed=42)
    model = w2v.fit(df_tags)
    result = model.transform(df_tags)
    (model.findSynonyms(
        "ggplot2",
        25).rdd.map(lambda x: (x[0], x[1])).saveAsTextFile(output_fn))
def get_pipeline(vector_size=50, class_num=5, stopwords=None):
    '''
    Build the pipeline.
    This demo pipeline contains the following stages:
    1. labelIndexer - index the labels, converting strings to integers
    2. tokenizer - split sentences into words
    3. remover - remove stop words
    4. word2vec - turn the text into low-dimensional vectors with word2vec
    5. mpc - multilayer perceptron classifier
    '''
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexLabel")
    tokenizer = Tokenizer(inputCol="text", outputCol="raw_words")
    remover = StopWordsRemover(inputCol="raw_words",
                               outputCol="words",
                               stopWords=stopwords)
    word2vec = Word2Vec(vectorSize=vector_size,
                        minCount=2,
                        inputCol="words",
                        outputCol="vector")
    layers = [vector_size, (vector_size + class_num) // 2, class_num]
    mpc = MultilayerPerceptronClassifier(maxIter=100,
                                         layers=layers,
                                         seed=1234,
                                         featuresCol="vector",
                                         labelCol="indexLabel")
    pipeline = Pipeline(
        stages=[labelIndexer, tokenizer, remover, word2vec, mpc])
    return pipeline
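A brief usage sketch for get_pipeline, assuming an active SparkSession named spark; the toy rows and stop words below are made up for illustration:

    df = spark.createDataFrame(
        [("spark makes text classification easy", "pos"),
         ("spark makes text classification painful", "neg")],
        ["text", "label"])
    pipeline = get_pipeline(vector_size=8, class_num=2, stopwords=["the", "a"])
    model = pipeline.fit(df)
    model.transform(df).select("text", "indexLabel", "prediction").show()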
def textstages(inputCol='stemmed'):
    #TF-IDF is a bag-of-words function, which gives higher weight to words that appear
    #frequently on a document but not frequently in all of the documents. The output is a features column.
    tf = HashingTF(inputCol=inputCol, outputCol='rawFeatures', numFeatures=500)
    idf = IDF(inputCol='rawFeatures', outputCol='features', minDocFreq=2.0)

    #Word2Vec is a Word Embedding function, which represents each word as a vector,
    #with words with similar meanings having neighboring vectors. The output is a feature column.
    word2vec = Word2Vec(vectorSize=300,
                        inputCol=inputCol,
                        outputCol='features')

    #Document Assembler to get Annotators (data type used by spark-NLP)
    docas = DocumentAssembler().setInputCol('joinedLem').setOutputCol(
        'document')
    tok = Tokenizer().setInputCols(['document']).setOutputCol('token')
    #add BERT class
    bert = BertEmbeddings.pretrained('bert_base_uncased', 'en').setInputCols(
        ['document', 'token']).setOutputCol('bertFeatures')
    embfin = sparknlp.EmbeddingsFinisher()\
        .setInputCols('bertFeatures')\
        .setOutputCols('finfeatures')\
        .setOutputAsVector(True)
    embfinfin = EmbeddingsFinisherFinisher(inputCol='finfeatures',
                                           outputCol='features')
    return [[tf, idf], [word2vec], [docas, tok, bert, embfin, embfinfin]]
Example #18
def build_pipeline(classifier='rf', max_depth=7):
    """
	creates a pipeline of functionalities to be applied on the training set
	"""

    # Training: tokenize, remove stop words, compute n-grams and term frequencies
    tokenizer = RegexTokenizer(inputCol="text",
                               outputCol="words",
                               pattern='\w{8}|\s')
    remover = StopWordsRemover(inputCol='words',
                               outputCol='filtered',
                               stopWords=['??'])
    ngram_2 = NGram(n=2, inputCol='filtered', outputCol='ngrams')
    ngram_3 = NGram(n=3, inputCol='filtered', outputCol='ngrams')
    hashingTF = HashingTF(inputCol="ngrams", outputCol="features")
    word2vec = Word2Vec(inputCol='ngrams', outputCol='features')

    if classifier == 'rf':
        clf = RandomForestClassifier(maxDepth=max_depth)
        stages = [tokenizer, remover, ngram_2, hashingTF, clf]
    elif classifier == 'nb':
        clf = NaiveBayes(smoothing=1)
        stages = [tokenizer, remover, ngram_3, hashingTF, clf]
    elif classifier == 'lr':
        clf = LogisticRegression()
        stages = [tokenizer, remover, ngram_2, word2vec, clf]
    else:
        raise ValueError("classifier must be 'rf', 'nb', or 'lr'.")
    return stages
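build_pipeline returns the list of stages rather than a fitted Pipeline, so a short usage sketch might look like the following (train_df is a hypothetical DataFrame with text and label columns):

    from pyspark.ml import Pipeline

    stages = build_pipeline(classifier='lr')
    model = Pipeline(stages=stages).fit(train_df)
    predictions = model.transform(train_df)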
def main(sc, spark):
    # Load the Corpus
    corpus = load_corpus(sc, spark)

    # Create the vector/cluster pipeline
    pipeline = Pipeline(stages=[
        Tokenizer(inputCol="text", outputCol="tokens"),
        Word2Vec(vectorSize=7, minCount=0, inputCol="tokens",
                 outputCol="vecs"),
        BisectingKMeans(k=10, featuresCol="vecs", maxIter=10),
    ])

    # Fit the model
    model = pipeline.fit(corpus)
    corpus = model.transform(corpus)

    # Evaluate clustering.
    bkm = model.stages[-1]
    cost = bkm.computeCost(corpus)
    sizes = bkm.summary.clusterSizes

    # TODO: compute cost of each cluster individually

    # Get the text representation of each cluster.
    wvec = model.stages[-2]
    table = [["Cluster", "Size", "Terms"]]
    for ci, c in enumerate(bkm.clusterCenters()):
        ct = wvec.findSynonyms(c, 7)
        size = sizes[ci]
        terms = " ".join([row.word for row in ct.take(7)])
        table.append([ci, size, terms])

    # Print Results
    print(tabulate(table))
    print("Sum of square distance to center: {:0.3f}".format(cost))
Example #20
def main(train_data, test_data, sc, sqlContext, output):
    text = sqlContext.read.json(train_data)

    train_df = text.select(text.reviewText, text.overall.alias("label"))

    # regextokenizer to split the words
    regexTokenizer = RegexTokenizer(inputCol="reviewText",
                                    outputCol="words",
                                    pattern="\\W|[0-9]")
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")

    #word2vec for representing as vectors
    word2Vec = Word2Vec(vectorSize=3,
                        minCount=0,
                        inputCol="filtered",
                        outputCol="features")

    lr = LinearRegression(maxIter=20, regParam=0.1, elasticNetParam=0.8)
    pipeline = Pipeline(stages=[regexTokenizer, remover, word2Vec, lr])

    paramGrid = (ParamGridBuilder().addGrid(
        lr.regParam, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]).build())

    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=RegressionEvaluator(),
                              numFolds=5)  # 5 fold cross validation

    cv_model = crossval.fit(train_df)

    # Training data evaluation
    train_prediction = cv_model.transform(train_df)
    train_prediction.show()
    train_evaluator = RegressionEvaluator(metricName="rmse",
                                          labelCol="label",
                                          predictionCol="prediction")
    train_rmse = train_evaluator.evaluate(train_prediction)

    text_test = sqlContext.read.json(test_data)
    test_df = text_test.select(text_test.reviewText,
                               text_test.overall.alias("label"))

    # Testing data evaluation
    test_prediction = cv_model.transform(test_df)
    test_prediction.show()
    test_evaluator = RegressionEvaluator(metricName="rmse",
                                         labelCol="label",
                                         predictionCol="prediction")
    test_rmse = test_evaluator.evaluate(test_prediction)

    print("Training Root mean square error = " + str(train_rmse))
    print("Testing Root mean square error = " + str(test_rmse))

    #output writen to file
    out_file = open(output, 'w')
    out_file.write(str(train_rmse) + "\n")
    out_file.write(str(test_rmse) + "\n")
    out_file.close()
def sentimentAnalyze():
    #rdd = sc.textFile("/ccga/SentimentAnalysisDataset.csv")
    '''#################################################FIRST DATA SET#################################################'''
    rdd = sc.textFile("/ccga/SentimentTrain60k.csv")
    r = rdd.mapPartitions(lambda x: csv.reader(x))
    parts = r.map(lambda x: Row(sentence=str.strip(x[3]), label=int(x[1])))
    spark = getSparkSessionInstance(rdd.context.getConf())
    partsDF = spark.createDataFrame(parts)
    #partsDF.show(truncate=False)
    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    tokenized = tokenizer.transform(partsDF)
    #tokenized.show(truncate=False)
    remover = StopWordsRemover(inputCol="words", outputCol="base_words")
    base_words = remover.transform(tokenized)
    #base_words.show(truncate=False)
    train_data_raw = base_words.select("base_words", "label")
    #train_data_raw.show(truncate=False)
    #base_words = train_data_raw.select("base_words")
    #base_words_rdd = base_words.rdd
    #print(base_words_rdd.collect())
    #base_words_map = base_words_rdd.flatMap(lambda x: x[0])
    #base_words_rdd.collect()
    word2Vec = Word2Vec(vectorSize=3,
                        minCount=0,
                        inputCol="base_words",
                        outputCol="features")
    model = word2Vec.fit(train_data_raw)
    final_train_data = model.transform(train_data_raw)
    #final_train_data.show()
    final_train_data = final_train_data.select("label", "features")
    #final_train_data.show(truncate=False)
    lr = LogisticRegression(maxIter=1000,
                            regParam=0.001,
                            elasticNetParam=0.0001)
    lrModel = lr.fit(final_train_data)
    trained = lrModel.transform(final_train_data)
    '''#################################################FIRST DATA SET#################################################'''
    '''#################################################SECOND DATA SET#################################################'''
    rdd2 = sc.textFile("/ccga/SentimentTest40k.csv")
    r2 = rdd2.mapPartitions(lambda x: csv.reader(x))
    parts2 = r2.map(lambda x: Row(
        sentence=str.strip(x[3]), label=int(x[1]), tweet=str(x[3])))
    spark2 = getSparkSessionInstance(rdd2.context.getConf())
    partsDF2 = spark2.createDataFrame(parts2)
    tokenizer2 = Tokenizer(inputCol="sentence", outputCol="words")
    tokenized2 = tokenizer2.transform(partsDF2)
    remover2 = StopWordsRemover(inputCol="words", outputCol="base_words")
    base_words = remover2.transform(tokenized2)
    train_data_raw2 = base_words.select("base_words", "label", "tweet")
    final_train_data2 = model.transform(train_data_raw2)
    final_train_data2 = final_train_data2.select("label", "features", "tweet")
    predict = lrModel.transform(final_train_data2)
    trained.show()
    predict.show()
    '''#################################################SECOND DATA SET#################################################'''
    print(
        "-------------------------------------------Working perfect-------------------------------------------"
    )
Example #22
def p1(time, rdd):

    rdd = rdd.map(lambda x: json.loads(x[1])).map(lambda x: x['text']).map(
        lambda x: x.upper())
    #rdd=rdd.map(lambda x:x.upper()).filter(lambda tweet:tweet!="HTTP" and tweet!="/" and tweet!="RT" and tweet!="@")

    rdd_MAGA = rdd.filter(lambda x: "MAGA" in x).map(lambda x: [x, "MAGA"])
    rdd_DICTATOR = rdd.filter(lambda x: "DICTATOR" in x).map(
        lambda x: [x, "DICTATOR"])
    rdd_IMPEACH = rdd.filter(lambda x: "IMPEACH" in x).map(
        lambda x: [x, "IMPEACH"])
    rdd_DRAIN = rdd.filter(lambda x: "DRAIN" in x).map(lambda x: [x, "DRAIN"])
    rdd_SWAMP = rdd.filter(lambda x: "SWAMP" in x).map(lambda x: [x, "SWAMP"])
    rdd_COMEY = rdd.filter(lambda x: "COMEY" in x).map(lambda x: [x, "COMEY"])

    rdd1 = sc.union(
        [rdd_MAGA, rdd_DICTATOR, rdd_IMPEACH, rdd_DRAIN, rdd_SWAMP, rdd_COMEY])

    parts = rdd1.map(lambda x: Row(sentence=x[0], label=x[1], date=time))
    spark = getSparkSessionInstance(rdd.context.getConf())
    partsDF = spark.createDataFrame(parts)
    #partsDF.show(truncate=False)

    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    tokenized = tokenizer.transform(partsDF)
    #tokenized.show(truncate=False)

    remover = StopWordsRemover(inputCol="words", outputCol="base_words")
    base_words = remover.transform(tokenized)
    #base_words.show(truncate=False)

    train_data_raw = base_words.select("base_words", "label", "date")
    #train_data_raw.show(truncate=False)

    base_words = train_data_raw.select("base_words")
    #base_words.show(truncate=False)

    #Vectorize
    word2Vec = Word2Vec(vectorSize=3,
                        minCount=0,
                        inputCol="base_words",
                        outputCol="features")
    model = word2Vec.fit(train_data_raw)
    final_train_data3 = model.transform(train_data_raw)
    #final_train_data3.show()
    final_train_data3 = final_train_data3.select("label", "features", "date")
    #final_train_data3.show(truncate=False)

    final_model = lrModel.transform(final_train_data3)
    final_model.show()

    sentimentDataFrame = final_model.select("label", "date", "prediction")
    sentimentDataFrame.createOrReplaceTempView("sentimental")
    sentimentDataFrame = spark.sql(
        "select label, date, prediction, count(*) as total_label from sentimental group by label, date, prediction order by label"
    )
    sentimentDataFrame.show()
    sentimentDataFrame.write.mode("append").saveAsTable("sentiment1")
Example #23
    def main(self):
        stop_words = []
        # prod
        dataframe = self.read_dataframe(self.path, self.days_list).persist()

        # read approved user list
        # df = self.spark.read.csv(
        #     "hdfs:///ssymmetry_db/raw_db/sina_user_tag/sina_user_tag_item/weibo_uid_with_user_tag.csv")\
        #     .select("uid", "user_tag")

        # local test
        # dataframe = self.spark.read.json("sina_weibo_fans_data_2017-11-09-10-18.json")

        blog_rdd = self.read_blog_data(dataframe).fillna(" ").rdd

        def preprocess_data(x):
            uid = x["uid"]
            blog_content = x["blog_content"]
            forward_content = x["forward_content"]
            if forward_content.rfind(u"*****") > 0:
                forward_content = forward_content.split(u"*****")[1]
            return (uid, blog_content + forward_content)

        data = blog_rdd.map(preprocess_data).reduceByKey(
            lambda x, y: x + y).map(
                lambda x: [" ".join(jieba.cut(x[1])).split(" ")])

        sql_context = SQLContext(sparkContext=self.spark.sparkContext)
        word_df = sql_context.createDataFrame(data, ["values"])

        w2vec = Word2Vec(vectorSize=128, inputCol="values")
        model = w2vec.fit(word_df)

        def create_dictionary(model):
            w_df = model.getVectors()
            w_df.show()
            data = w_df.rdd.collect()
            w2index = {}
            w2vec = {}
            i = 1
            for row in data:
                word = row.word
                vector = row.vector
                w2index[word] = i
                w2vec[word] = vector
                i += 1
            return w2index, w2vec

        # Write the word2vec word vectors out to a pickle file
        index_dict, word_vectors = create_dictionary(model)
        # out = open("w2vec.pkl", "wb")
        out = open("/udisk2/hxk/w2vec/w2vec.pkl", "wb")
        pickle.dump(index_dict, out)  # index dictionary
        pickle.dump(word_vectors, out)  # word-vector dictionary
        out.close()

        # test
        model.findSynonyms("你", 3).show()
Example #24
    def get_word_vec(self):
        data = self.merge_df.groupBy('user_id').agg(
            func.sort_array(func.collect_list(func.struct(func.col('time'), func.col('ad_id'))), asc=True).alias(
                'items'))
        data = data.withColumn("items", func.udf(lambda x: [i[1] for i in x], ArrayType(StringType()))('items'))

        word2Vec = Word2Vec(vectorSize=128, minCount=10, inputCol="items", outputCol="result")
        model = word2Vec.fit(data.repartition(1000))
        return model
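The method returns the fitted Word2VecModel rather than transformed data. A hedged usage sketch on the caller side (the instance name is an assumption):

# Illustrative caller-side usage of the returned Word2VecModel.
model = obj.get_word_vec()      # `obj` is an instance of the class above (assumed)
model.getVectors().show(5)      # DataFrame of (word, vector) rows, i.e. per-ad_id embeddings
# model.save("hdfs:///models/ad_w2v")  # illustrative path only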
Example #25
0
    def train(self):

        self.__prepare()

        spark = SparkSession\
            .builder\
            .appName("Kursach")\
            .getOrCreate()

        input_file = spark.sparkContext.textFile('./w2v.txt')

        # print(input_file.collect())
        prepared = input_file.map(lambda x: ([x]))
        df = prepared.toDF()
        prepared_df = df.selectExpr('_1 as text')

        tokenizer = Tokenizer(inputCol='text', outputCol='words')
        words = tokenizer.transform(prepared_df)

        stop_words = StopWordsRemover.loadDefaultStopWords('russian')
        remover = StopWordsRemover(inputCol='words',
                                   outputCol='filtered',
                                   stopWords=stop_words)
        filtered = remover.transform(words)

        # print(stop_words)

        # filtered.show()

        # words.select('words').show(truncate=False, vertical=True)

        # filtered.select('filtered').show(truncate=False, vertical=True)

        vectorizer = CountVectorizer(inputCol='filtered',
                                     outputCol='raw_features').fit(filtered)
        featurized_data = vectorizer.transform(filtered)
        featurized_data.cache()
        vocabulary = vectorizer.vocabulary

        # featurized_data.show()

        # featurized_data.select('raw_features').show(truncate=False, vertical=True)

        # print(vocabulary)

        idf = IDF(inputCol='raw_features', outputCol='features')
        idf_model = idf.fit(featurized_data)
        rescaled_data = idf_model.transform(featurized_data)

        self.__word2Vec = Word2Vec(vectorSize=3,
                                   minCount=0,
                                   inputCol='words',
                                   outputCol='result')
        self.__model = self.__word2Vec.fit(filtered)
        w2v_df = self.__model.transform(words)
        w2v_df.show()
        spark.stop()
def word2vec(words):
    word2Vec = Word2Vec(vectorSize=300,
                        minCount=2,
                        seed=42,
                        inputCol="words",
                        outputCol="features")
    model = word2Vec.fit(words)
    w2v = model.transform(words)
    return w2v
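A hedged usage sketch for the helper above: it expects a DataFrame with a "words" array column, for example produced by a Tokenizer. The toy data and the `spark` session are assumptions.

from pyspark.ml.feature import Tokenizer

# Illustrative usage only; `spark` is an existing SparkSession.
docs = spark.createDataFrame([("spark is fast",), ("spark is scalable",)], ["text"])
tokens = Tokenizer(inputCol="text", outputCol="words").transform(docs)
vectors = word2vec(tokens)   # adds the 300-dimensional "features" column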
def method(traindata, testdata, model):
    regexTokenizer = RegexTokenizer(inputCol="text",
                                    outputCol="words",
                                    pattern="\\W")
    add_stopwords = ["http", "https", "amp", "rt", "t", "c", "the"]

    stopwordsRemover = StopWordsRemover(
        inputCol="words", outputCol="filtered").setStopWords(add_stopwords)

    label_stringIdx = StringIndexer(inputCol="airline_sentiment",
                                    outputCol="label")
    if (model == "Count"):
        x = CountVectorizer(inputCol="filtered",
                            outputCol="features",
                            vocabSize=10000,
                            minDF=5)

    elif (model == "Word2Vec"):
        x = Word2Vec(vectorSize=1000,
                     minCount=5,
                     inputCol="filtered",
                     outputCol="features")

    else:
        hashingTF = HashingTF(inputCol="filtered",
                              outputCol="rawFeatures",
                              numFeatures=10000)
        x = IDF(inputCol="rawFeatures", outputCol="features",
                minDocFreq=5)  #minDocFreq: remove sparse terms

    lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)

    if (model == "TFIDF"):
        pipeline = Pipeline(stages=[
            regexTokenizer, stopwordsRemover, label_stringIdx, hashingTF, x, lr
        ])
    else:
        pipeline = Pipeline(
            stages=[regexTokenizer, stopwordsRemover, label_stringIdx, x, lr])

    pipelineFit = pipeline.fit(traindata)
    predictions = pipelineFit.transform(testdata)

    predictions.filter(predictions['prediction'] == 0).select(
        "text", "airline_sentiment", "probability", "label",
        "prediction").orderBy("probability", ascending=False).show(n=10,
                                                                   truncate=30)
    predictions.filter(predictions['prediction'] == 1).select(
        "text", "airline_sentiment", "probability", "label",
        "prediction").orderBy("probability", ascending=False).show(n=10,
                                                                   truncate=30)
    # BinaryClassificationEvaluator reports areaUnderROC by default, so label the metric accordingly
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction",
                                              labelCol="label")
    print("Area under ROC: %g" % (evaluator.evaluate(predictions)))
    c = "logreg" + model + ".model"
    pipelineFit.save(c)
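A hedged driver sketch for the function above: it expects airline-tweet DataFrames containing "text" and "airline_sentiment" columns. The CSV path, split ratio, and `spark` session below are illustrative assumptions.

# Illustrative driver; dataset path and split ratio are assumptions.
tweets = spark.read.csv("Tweets.csv", header=True, inferSchema=True) \
              .select("text", "airline_sentiment").na.drop()
traindata, testdata = tweets.randomSplit([0.8, 0.2], seed=42)
for m in ("Count", "Word2Vec", "TFIDF"):
    method(traindata, testdata, m)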
Example #28
0
def make_places_model():
    spark = SparkSession \
        .builder \
        .appName("SimpleApplication") \
        .getOrCreate()

    data = read_data_for_mode('forPlacesModel')

    persons = []
    text = []

    for person in data:
        NAME = person[0].split(' ')
        persons.append('_'.join(NAME))
        text += person[1]

    def remove_punctuation(text):
        return re.sub(r'[^\w\s^-]', '', text)

    for i in range(len(text)):
        text[i] = remove_punctuation(text[i])

    def remove_stop_words(text: list):
        ru_stop = stopwords.words('russian')
        tokens = []
        for line in text:
            line_tokenized = line.split()
            for token in line_tokenized:
                if not token.lower() in ru_stop:
                    tokens.append(token)
        return tokens

    tokens = remove_stop_words(text)

    def get_only_words(tokens):
        return list(filter(lambda x: re.match('[а-яА-Я]+', x), tokens))

    tokens = get_only_words(tokens)

    full_text = ' '.join(tokens)

    documentDF = spark.createDataFrame([(full_text.split(" "), )], ["text"])

    model = Word2Vec(vectorSize=3,
                     minCount=0,
                     inputCol="text",
                     outputCol="result")
    model_fitted = model.fit(documentDF)
    model_transformed = model_fitted.transform(documentDF)

    model.save('/home/pok/sem/project/models/places/model0mincount/model')
    model_fitted.save(
        '/home/pok/sem/project/models/places/model0mincount/fitted')
    # model_transformed.save('/home/pok/sem/project/models/model0mincount/transformed')

    spark.stop()
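A hedged sketch of reading the persisted estimator and fitted model back, using the same paths as above; the synonym query word is illustrative and must exist in the trained vocabulary.

from pyspark.ml.feature import Word2Vec, Word2VecModel

# Reload what make_places_model() saved.
estimator = Word2Vec.load('/home/pok/sem/project/models/places/model0mincount/model')
fitted = Word2VecModel.load('/home/pok/sem/project/models/places/model0mincount/fitted')
fitted.findSynonyms('москва', 5).show()   # illustrative query word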
    def overlappingNgramWord2VecEncode(self, n = None, windowSize = None, vectorSize = None, fileName = None, sc = None):
        '''
        Encodes a protein sequence by converting it into n-grams and
        then transforming it into a Word2Vec feature vector.

        If a word2Vec file name is given, this function instead encodes the
        protein sequence by converting it into n-grams and transforming them
        with the pre-trained word2Vec model read from that file.

        Args:
            n (int): The number of words in an n-gram
            windowSize (int): width of the window used to slide across the \
                              sequence, context words from [-window, window]
            vectorSize (int): dimension of the feature vector
            fileName (str): filename of Word2Vec model

        Returns:
            dataset with a feature vector column added to the original dataset
        '''

        # Create n-grams out of the sequence
        # E.g., 2-gram IDCGH, ... =>[ID, DC, CG, GH, ...]

        data = sequenceNgrammer.ngram(self.data, n, "ngram")

        if n is not None and windowSize is not None and vectorSize is not None:
            # Convert n-grams to a Word2Vec feature vector
            # [ID, DC, CG, GH, ...] => [0.1234, 0.2394, ...]
            word2Vec = Word2Vec()
            word2Vec.setInputCol("ngram") \
                    .setOutputCol(self.outputCol) \
                    .setNumPartitions(8) \
                    .setWindowSize(windowSize) \
                    .setVectorSize(vectorSize)

            self.model = word2Vec.fit(data)

        elif fileName is not None and sc is not None:
            # pyspark.ml's Word2VecModel.load expects only the saved model path
            self.model = Word2VecModel.load(fileName)

            print(f"model file : {fileName} \n \
                    inputCol : {self.model.getInputCol()} \n \
                    windowSize : {self.model.getWindowSize()} \n \
                    vectorSize : {self.model.getVectorSize()}")

            self.model.setOutputCol(self.outputCol)

        else:
            raise Exception("Either provide word2Vec file (filename) + SparkContext (sc), \
                            or number of words(n) + window size(windowSize) \
                            + vector size (vetorSize), for function parameters")
            return

        return self.model.transform(data)
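For intuition about the n-gram step described in the docstring above, a tiny plain-Python sketch (independent of Spark) of the overlapping split:

# Plain-Python illustration of overlapping n-grams, not the Spark code path.
def overlapping_ngrams(sequence, n=2):
    return [sequence[i:i + n] for i in range(len(sequence) - n + 1)]

# overlapping_ngrams("IDCGH") -> ['ID', 'DC', 'CG', 'GH']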
Example #30
0
def main():
    conf = SparkConf().setAppName('Sentiment Analysis_Word2Vec')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    train_inputs = sys.argv[1]
    test_inputs = sys.argv[2]

    schema = StructType([
        StructField('reviewText', StringType(), False),
        StructField('overall', DoubleType(), False),
    ])

    
    read_json = sqlContext.read.json(train_inputs, schema)
    read_json.registerTempTable('read_json')
    lowercase = sqlContext.sql("""
    SELECT lower(reviewText) as reviewText, overall as label
    FROM read_json
    """)
    
    regexTokenizer = RegexTokenizer(inputCol="reviewText", outputCol="words", pattern="\\W")

    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    
    word2vec = Word2Vec(vectorSize=3, minCount=0, inputCol="filtered", outputCol="features")
    
    lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
    
    pipeline = Pipeline(stages=[regexTokenizer, remover, word2vec, lr])
    
    
    paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .build()

    crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=RegressionEvaluator(),
                          numFolds=5)

    cvModel = crossval.fit(lowercase)
        
    testDF = sqlContext.read.json(test_inputs, schema)
    testDF.registerTempTable('test_data')
    test_data = sqlContext.sql("""
    SELECT lower(reviewText) as reviewText, overall as label
    FROM test_data
    """)

    train_prediction = cvModel.transform(lowercase)
    test_prediction = cvModel.transform(test_data)
    evaluator = RegressionEvaluator()

    print "Training dataset RMSE error: %s" %str(evaluator.evaluate(train_prediction))
    print "Testing dataset RMSE: %s" %str(evaluator.evaluate(test_prediction))