def main():
    """Fit the root ``EventNode`` model from pickled inputs.

    Command-line arguments (``sys.argv``):
        1: data pickle holding ``[Xs, vects, DT, ind2obj]``.
        2: initialisation pickle holding ``[inits, labels, centers]``.
        3: ``tweetPre`` value forwarded to ``printCluster``.
        4: optional output path; when present, ``[params, descriptor]`` of
           the fitted root node is pickled there.
        5: optional; when any fifth argument is present the clusters are
           printed via ``rootNode.printCluster``.
    """
    Xs, vects, DT, ind2obj = loadPickle(sys.argv[1])
    # Only ``inits`` is consumed here; the other two entries are unpacked so
    # that a pickle with an unexpected layout still fails loudly.
    inits, _labels, _centers = loadPickle(sys.argv[2])
    tweetPre = sys.argv[3]
    outPickle = sys.argv[4] if len(sys.argv) > 4 else None

    # --- parameters (original author's TODO: move these to argv) ---
    # A four-view configuration was left disabled by the original author:
    #   Xinds = [0, 1, 2, 3]; ws = [ww, wp, wl, wo]
    Xinds = [0, 5]
    ww = 1
    ws = [ww, 1]
    selectTime = 1
    wt = 0.5
    lambdaB = 0.5
    Learn = (1, 10)
    params = NodeParams(Xinds, ws, lambdaB, selectTime, wt, Learn)

    # --- run ---
    rootNode = EventNode(Xs, DT, params, inits)
    rootNode.run()

    # --- pickle ---
    if outPickle is not None:
        sys.stderr.write('saving pickle...\n')
        # Pickle streams are bytes: the file must be opened in binary mode
        # ('w' fails on Python 3 and can corrupt data on Windows).
        with open(outPickle, 'wb') as f:
            pickle.dump([params, rootNode.descriptor], f)

    # --- print ---
    sys.stderr.write('After pickle, printing...\n')
    if len(sys.argv) > 5:
        rootNode.printCluster(vects, ind2obj, tweetPre=tweetPre)
def main():
    """Score news sentences and tweets for one event and print two summaries.

    Command-line arguments (``sys.argv``):
        1: pickle holding ``[resDocInd, tweetsObj, tweetsObjDedup, tweetsScore]``.
        2: data pickle holding ``[Xs, vects, DT, ind2obj]``.
        3: pickle holding ``(rootParams, rootNodeDescriptor)``.
        4: ``topK`` - number of entities passed to ``getEntInd``.
        5: ``kSummary`` - count passed to ``printSummary``.
        6: ``i`` - column index selected from each ``Pw_zs`` matrix.

    The first summary uses the initial scores, the second (after a
    ``*****`` separator) uses the scores returned by ``triHits``.
    Timing information for each stage is printed along the way.
    """
    # NOTE: the pickled ``tweetsScore`` is unpacked but recomputed below.
    [resDocInd, tweetsObj, tweetsObjDedup, tweetsScore] = loadPickle(sys.argv[1])
    [Xs, vects, DT, ind2obj] = loadPickle(sys.argv[2])
    rootParams, rootNodeDescriptor = loadPickle(sys.argv[3])
    topK = int(sys.argv[4])      # ent used
    kSummary = int(sys.argv[5])  # summary sentences
    i = int(sys.argv[6])
    window = 5

    t0 = time()
    Pw_zs = rootNodeDescriptor.Pw_zs
    Pe_z = Pw_zs[1][:, i]
    evocab = vects[5].get_feature_names()
    vocab = vects[0].get_feature_names()
    # ents: the order in which EN comes in
    ent_ind, ents = getEntInd(evocab, Pe_z, topK)
    print("entscore in " + str(time() - t0))

    t0 = time()
    newsObj = [ind2obj[n] for n in resDocInd]
    XN, XEn, NEb, sentencesIn, sentencesInObj, ent_text_n = getNewsContext(
        newsObj, ent_ind, ents, vocab, window)
    print("get news Context in " + str(time() - t0))
    # Single pre-formatted string: prints "a b c" on both Python 2 and 3
    # (the former ``print a, b, c`` statement is a syntax error on Python 3).
    print("%s %s %s" % (len(newsObj), len(sentencesIn), len(set(sentencesIn))))

    t0 = time()
    XT, XEt, TEb, tweetsIn, tweetsInObj, ent_text_t = getTweetContext(
        tweetsObjDedup, ent_ind, ents, vocab, window)
    print("get tweet Context in " + str(time() - t0))
    print("%s %s %s" % (len(tweetsObjDedup), len(tweetsIn), len(set(tweetsIn))))

    t0 = time()
    newsScore = XN.dot(Pw_zs[0][:, i])
    tweetsScore = XT.dot(Pw_zs[0][:, i])
    print("init score in " + str(time() - t0))

    t0 = time()
    # The original author left ``.multiply(NEb)`` / ``.multiply(TEb)``
    # disabled on the next two products.
    NE_ = XN.dot(XEn.T)
    TE_ = XT.dot(XEt.T)
    NE, EN = normBypartite(NE_)
    TE, ET = normBypartite(TE_)
    print("graph constr in " + str(time() - t0))

    t0 = time()
    nScore, tScore = triHits(newsScore, tweetsScore, NE, EN, TE, ET,
                             0.2, 0.2, 5)
    print("trihits in " + str(time() - t0))

    printSummary(newsScore, tweetsScore, sentencesIn, sentencesInObj,
                 tweetsIn, tweetsInObj, kSummary)
    print("*****")
    printSummary(nScore, tScore, sentencesIn, sentencesInObj,
                 tweetsIn, tweetsInObj, kSummary)
def main():
    """Fit the root ``EventNode`` and optionally pickle and print the result.

    Positional command-line arguments:
        argv[1]  data pickle -> ``[Xs, vects, DT, ind2obj]``
        argv[2]  inits pickle -> ``[inits, labels, centers]``
        argv[3]  ``tweetPre``, forwarded to ``printCluster``
        argv[4]  (optional) path where ``[params, rootNode.descriptor]``
                 is pickled
        argv[5]  (optional) its mere presence enables ``printCluster``
    """
    argc = len(sys.argv)
    [Xs, vects, DT, ind2obj] = loadPickle(sys.argv[1])
    [inits, labels, centers] = loadPickle(sys.argv[2])
    tweetPre = sys.argv[3]

    ### params move to argv ###
    # Disabled alternative kept from the original:
    #   Xinds = [0,1,2,3] with ws = [ww,wp,wl,wo]
    Xinds = [0, 5]
    ww = 1
    ws = [ww, 1]
    selectTime = 1
    wt = 0.5
    lambdaB = 0.5
    Learn = (1, 10)
    params = NodeParams(Xinds, ws, lambdaB, selectTime, wt, Learn)

    ### run ###
    rootNode = EventNode(Xs, DT, params, inits)
    rootNode.run()

    ########## pickle #######
    if argc > 4:
        outPickle = sys.argv[4]
        sys.stderr.write('saving pickle...\n')
        # Binary mode is required for pickle data; text mode 'w' raises
        # TypeError on Python 3 and is unsafe on Windows.
        with open(outPickle, 'wb') as f:
            pickle.dump([params, rootNode.descriptor], f)

    ########### print #######
    sys.stderr.write('After pickle, printing...\n')
    if argc > 5:
        rootNode.printCluster(vects, ind2obj, tweetPre=tweetPre)
def _loadFuncs(self):
    """Build one spline interpolation per entry of ``self.filesMap``.

    The loader is chosen from the file name: ``.pickle`` and ``.json`` are
    handled explicitly, anything else is read as CSV. Each resulting
    object is stored in ``self.funcDict`` under the same key.

    NOTE(review): when a loader returns ``None`` the problem is reported
    through ``self.error`` and execution then continues with the ``None``
    payload unless ``self.error`` raises - confirm that it does.
    """
    self.log('Start files loading...')
    for name, path in self.filesMap.items():
        self.log(f'Try to load {name} file: {path}')
        if path.endswith('.pickle'):
            payload = utils.loadPickle(path, 'rb')
            if payload is None:
                self.error(
                    f'Unable to load {name} function as pickle:\n{path}.')
            interpolant = num_methods.interpolation.SplineInterpolation(
                payload)
        elif path.endswith('.json'):
            payload = utils.loadJson(path, 'r')
            if payload is None:
                self.error(
                    f'Unable to load {name} function interpolation as json:\n{path}.'
                )
            # JSON files hold a serialised interpolation rather than a grid.
            empty = num_methods.interpolation.SplineInterpolation(None)
            interpolant = empty.load_from_dict(payload)
        else:
            payload = utils.loadCSV(path)
            if payload is None:
                self.error(
                    f'Unable to load {name} function as csv:\n{path}.')
            interpolant = num_methods.interpolation.SplineInterpolation(
                payload)
        self.funcDict[name] = interpolant
        self.log('Successful')
    self.log('All files loaded')
def spectral_cluster():
    """Run spectral clustering (100 clusters) on the saved affinity pickle.

    Reads ``./models/trump_sample_affinity.pickle``, stores the result in
    ``./models/trump_sample_spectral.pickle``, then prints the result and
    the elapsed wall-clock time.
    """
    started = time()
    affinity = loadPickle('./models/trump_sample_affinity.pickle')
    S = spectral_clustering(affinity, n_clusters=100)
    savePickle(S, './models/trump_sample_spectral.pickle')
    print(S)
    elapsed = time() - started
    print("Spectral clustering took {}s".format(elapsed))
def kmeans():
    """Run k-means (100 clusters) on the saved sample vectors.

    Reads ``./models/trump_sample_vectors.pickle``, stores the result in
    ``./models/trump_sample_kmeans.pickle``, then prints the result and
    the elapsed wall-clock time.
    """
    started = time()
    vectors = loadPickle('./models/trump_sample_vectors.pickle')
    # NOTE(review): presumably scikit-learn's k_means; newer releases no
    # longer accept ``n_jobs`` - confirm against the pinned version.
    K = k_means(vectors, n_clusters=100, n_jobs=-1)
    savePickle(K, './models/trump_sample_kmeans.pickle')
    print(K)
    elapsed = time() - started
    print("K-means took {}s".format(elapsed))
from utils import countLines
import sys
from warnings import warn
import string
import numpy as np
from itertools import cycle
from itertools import repeat

# Module-level model loading: word2vec model first, then the
# MultiLabelBinarizer. ``w2v`` stays False when no model file is found.
w2vfile = './models/w2v'
mlbfile = './models/mlb.pickle'
w2v = False

# The pickle is tried before the .bin file because it loads faster.
w2v_pickle_path = w2vfile + '.pickle'
w2v_bin_path = w2vfile + '.bin'
if os.path.exists(w2v_pickle_path):
    w2v = loadPickle(w2v_pickle_path)
elif os.path.exists(w2v_bin_path):
    w2v = loadWord2Vec(w2v_bin_path)
else:
    warn(
        "{} not found, will not be able to sub or create word matrices".format(
            w2vfile))

if w2v:
    word_d = w2v.layer1_size

# Any of these flags on the command line means "prepare/make" mode, in
# which an existing binarizer pickle is deliberately not loaded.
prepare_mode = any(
    flag in sys.argv for flag in ('-p', '--prepare', '-m', '--make'))
if not prepare_mode and os.path.exists(mlbfile):
    mlb = loadPickle(mlbfile)
    valid_hashtags = set(mlb.classes_)
def loadCoeff(self, path):
    """Load the pickle at ``path`` and store it as ``self.coeff``."""
    coefficients = loadPickle(path)
    self.coeff = coefficients
def loadLoc(self, path):
    """Load the location table at ``path`` and derive ``N`` and ``width``.

    Sets ``self.loc`` to the unpickled object, ``self.N`` to its length and
    ``self.width`` to twice the truncated sum of the largest absolute value
    in the first two columns and half the largest value in column 6.

    NOTE(review): presumably columns 0-1 are coordinates and column 6 is
    an extent/size - confirm against the producer of the pickle.
    """
    self.loc = loadPickle(path)
    table = self.loc
    self.N = len(table)
    largest_abs_coord = np.abs(table[:, :2]).max()
    half_largest_col6 = table[:, 6].max() / 2.0
    self.width = 2 * int(largest_abs_coord + half_largest_col6)
import sys
import os
import pickle
from eknot_utils import init_all, EventNode
from utils import loadPickle

if __name__ == "__main__":
    # Computes the initialisation for K events and optionally pickles it.
    # input args: K tweetPre dataPickle outPickle mini [n_init init_size batch_size]
    # The three bracketed arguments are read only when ``mini`` is non-zero.
    # Passing the literal string 'null' as outPickle skips writing the pickle.
    K = int(sys.argv[1])
    tweetPre = sys.argv[2]
    [Xs, vects, DT, ind2obj] = loadPickle(sys.argv[3])
    outPickle = sys.argv[4]
    mini = int(sys.argv[5])
    if mini:
        n_init = int(sys.argv[6])
        init_size = int(sys.argv[7])
        batch_size = int(sys.argv[8])

    # inits
    sys.stderr.write("begin initiating... \n")
    if mini:
        inits, labels, centers = init_all(K, Xs, DT, mini, n_init,
                                          init_size, batch_size)
    else:
        inits, labels, centers = init_all(K, Xs, DT)

    # write
    if outPickle != 'null':
        # pickle.dump needs a binary file; text mode 'w' fails on Python 3.
        with open(outPickle, 'wb') as f:
            pickle.dump([inits, labels, centers], f)
    sys.stderr.write("Pickle saved. Begin printing... \n")

    ####################### print #######################
    rootNode = EventNode(Xs, initsDescriptor=inits)
import sys, os
import pickle
from eknot_utils import nextData, weightX, subRun, EventNode, NodeParams
from utils import loadPickle

if __name__ == "__main__":
    # Runs the sub-event model below an already fitted, pickled root node.
    # args: data_pickle eventNode_pickle tweetPre switchText eventID Kevent [outPickle]
    _, vects, _, ind2obj = loadPickle(sys.argv[1])
    rootNode = loadPickle(sys.argv[2])
    tweetPre, switch = sys.argv[3], sys.argv[4]
    eventID = int(sys.argv[5])  # sub event number
    K = int(sys.argv[6])
    if len(sys.argv) > 7:
        outPickle = sys.argv[7]

    ######### sub ###########
    sys.stderr.write('Running sub...\n')
    n_wdxPz_wds, XsWeighted = nextData(rootNode)

    ## params
    numX = len(XsWeighted)
    Xinds = range(numX)  # can be customized
    ws = [1] * numX      # one unit weight per entry of Xinds; can be customized
    selectTime = 0
    wt = 0.5
    lambdaB = 0.5
    Learn = (1, 10)
    params = NodeParams(Xinds, ws, lambdaB, selectTime, wt, Learn, eventID)

    eventNode = subRun(XsWeighted, n_wdxPz_wds, K, params,
                       rootNode.DT, rootNode.dID)
from preprocess import text2mat from utils import loadPickle import numpy as np import os from warnings import warn from numpy.linalg import norm from utils import saveTweet2Vec from utils import loadTweet2Vec import matplotlib.pyplot as plt from keras.callbacks import ModelCheckpoint from keras.callbacks import CSVLogger from sklearn.metrics.pairwise import euclidean_distances mlb_file = './models/mlb.pickle' if os.path.exists(mlb_file): mlb = loadPickle(mlb_file) else: warn( "{} doesn't exist - need this to generate labels for training: run `./preprocess.py --prepare input.txt` first" ) class Tweet2Vec: def __init__(self, model=None, char=True, chrd=True, word=True, normalize=False): ''' Initialize stuff
from utils import loadPickle import numpy as np from sklearn.neighbors import KNeighborsClassifier from collections import Counter celebs, celeb_encodings = loadPickle( 'Show-Segmentation-2020/final_celeb_detection/final_pickles/anchors-with-TV-encodings.pickle' ) celeb_encodings = np.array([np.array(x) for x in celeb_encodings]) # Populating KNN space with labelled encodings X = [] Y = [] for i in range(len(celeb_encodings)): #prepare dataset for celeb_encoding in celeb_encodings[i]: X.append(celeb_encoding) Y.append(celebs[i]) neigh = KNeighborsClassifier(n_neighbors=30) neigh.fit(X, Y) def encoding2name(f_encodings): return neigh.predict(f_encodings) def findHostNames(shows, face_encodings): for show in shows: hosts = show.hosts.split('&') #getting list of hosts of the show hosts = sorted( hosts, key=lambda x: len(face_encodings[int(x)]), reverse=True) #Most occuring anchor is taken as the main anchor
import os import pandas as pd import matplotlib.pyplot as plt from plotly.offline import plot import plotly.express as px from sklearn.manifold import TSNE import seaborn as sns import utils as ut import configs as cf #%% Read text2vec pickle shape_run_id = '0209-0306' run_root_dir = os.path.join(cf.SHAPE_RUN_DIR, shape_run_id) shape2vec = ut.loadPickle(os.path.join(run_root_dir, "shape2vec.pkl")) shape2loss = ut.loadPickle(os.path.join(run_root_dir, "shape2loss.pkl")) bright = [ "#023EFF", "#FF7C00", "#1AC938", "#E8000B", "#8B2BE2", "#9F4800", "#F14CC1", "#A3A3A3", "#000099", "#00D7FF", "#222A2A" ] #%% Run TSNE on the latent vectors latent_dim = shape2vec.get('52255064fb4396f1b129901f80d24b7b').shape[0] latent_vects = np.zeros((len(shape2vec), latent_dim)) for i, key in enumerate(shape2vec.keys()): latent_vects[i, :] = shape2vec[key] perp, lr = 40, 200 tsne = TSNE(n_components=2,
from utils import loadPickle
from smart_open import smart_open
from preprocess import TweetHashtagIterator

# The word2vec model is loaded once at import time and shared by sub().
w2vfile = './models/w2v_1day.pickle'
w2v = loadPickle(w2vfile)


class TweetSubIterator(TweetHashtagIterator):
    """``TweetHashtagIterator`` preconfigured with ``'tweet'`` and ``True``."""

    def __init__(self, source):
        TweetHashtagIterator.__init__(self, source, 'tweet', True)

    def __iter__(self):
        # NOTE(review): placeholder - returns None, so iterating an
        # instance will fail until this is implemented.
        pass


def sub(tweet, thresh=.9):
    """Return ``tweet`` with words swapped for their closest w2v neighbour.

    A whitespace-separated word is replaced only when it is in ``w2v`` and
    the similarity of its top ``most_similar`` hit is strictly greater than
    ``thresh``; otherwise it is kept. Words are re-joined with single spaces.
    """
    # TODO cache "most_similar" for speed?
    rewritten = []
    for token in tweet.split():
        chosen = token
        if token in w2v:
            best = w2v.most_similar(token)[0]
            if best[1] > thresh:
                chosen = best[0]
        rewritten.append(chosen)
    return ' '.join(rewritten)
import sys,os
import pickle
from eknot_utils import nextData,weightX,subRun,EventNode,NodeParams
from utils import loadPickle

if __name__ == "__main__":
    # Rebuilds the root node from a (params, descriptor) pickle and prepares
    # the parameters for the sub-event run.
    # args: data_pickle plsa_pickle tweetPre switchText eventID Kevent [outPickle]
    Xs, vects, DT, ind2obj = loadPickle(sys.argv[1])
    rootParams, rootNodeDescriptor = loadPickle(sys.argv[2])
    tweetPre, switch = sys.argv[3], sys.argv[4]
    eventID = int(sys.argv[5])  # event number
    K = int(sys.argv[6])
    if len(sys.argv) > 7:
        outPickle = sys.argv[7]
    rootNode = EventNode(Xs, params=rootParams,
                         descriptor=rootNodeDescriptor)

    ######### sub ###########
    sys.stderr.write('Running sub...\n')
    n_wdxPz_wds, XsWeighted = nextData(rootNode)

    ## params
    numX = len(XsWeighted)
    Xinds = range(numX)  # can be customized
    ws = [1] * numX      # one unit weight per entry of Xinds; can be customized
    selectTime = 0
    wt = 0.5
    lambdaB = 0.5
    Learn = (1, 10)
    params = NodeParams(Xinds, ws, lambdaB, selectTime, wt, Learn, eventID)
import sys, os
import pickle
from eknot_utils import init_all, EventNode
from utils import loadPickle

if __name__ == "__main__":
    # Computes the initialisation for K events and optionally pickles it.
    # input args: K tweetPre dataPickle outPickle mini [n_init init_size batch_size]
    # n_init / init_size / batch_size are only read when ``mini`` is non-zero;
    # an outPickle of 'null' disables writing.
    K = int(sys.argv[1])
    tweetPre = sys.argv[2]
    [Xs, vects, DT, ind2obj] = loadPickle(sys.argv[3])
    outPickle = sys.argv[4]
    mini = int(sys.argv[5])
    if mini:
        n_init = int(sys.argv[6])
        init_size = int(sys.argv[7])
        batch_size = int(sys.argv[8])

    # inits
    sys.stderr.write("begin initiating... \n")
    if mini:
        inits, labels, centers = init_all(K, Xs, DT, mini, n_init, init_size,
                                          batch_size)
    else:
        inits, labels, centers = init_all(K, Xs, DT)

    # write
    if outPickle != 'null':
        # Pickle output is bytes, so the file is opened in binary mode
        # (text mode 'w' raises TypeError on Python 3).
        with open(outPickle, 'wb') as f:
            pickle.dump([inits, labels, centers], f)
    sys.stderr.write("Pickle saved. Begin printing... \n")

    ####################### print #######################
import sys
from eknot_utils import EventNode
from utils import loadPickle
import pickle

if __name__ == "__main__":
    # Prints clusters from either an "inits_" or a "plsa_" pickle; the kind
    # is decided by a substring test on the pickle's file name.
    # input args: data_pickle inits/plsa_pickle tweetPre switchtext i outpickle
    [Xs, vects, DT, ind2obj] = loadPickle(sys.argv[1])
    picklename = sys.argv[2]
    tweetPre = sys.argv[3]
    switch = sys.argv[4]
    i = int(sys.argv[5])
    outPickle = sys.argv[6]

    if 'inits_' in picklename:
        [inits, labels, centers] = loadPickle(picklename)
        rootNode = EventNode(Xs, initsDescriptor=inits)
        sys.stderr.write('Printing...\n')
        rootNode.printCluster(vects, ind2obj, tweetPre=tweetPre,
                              switch=switch, fromPlsa=0)
    elif 'plsa_' in picklename:
        rootParams, rootNodeDescriptor = loadPickle(picklename)
        rootNode = EventNode(Xs, params=rootParams,
                             descriptor=rootNodeDescriptor)
        sys.stderr.write('Printing...\n')
        resDocInd, tweetsObj, tweetsObjDedup, tweetsScore = \
            rootNode.printCluster_i(vects, ind2obj, i, tweetPre=tweetPre,
                                    switch=switch, fromPlsa=1)
        sys.stderr.write('saving pickle...\n')
        # Only this branch writes outPickle. Binary mode is required for
        # pickle data (text mode 'w' fails on Python 3).
        with open(outPickle, 'wb') as f:
            pickle.dump([resDocInd, tweetsObj, tweetsObjDedup, tweetsScore],
                        f)
    else:
        sys.stderr.write("wrong plsa/inits pickle name\n")
        # sys.exit is always available; the bare ``exit`` helper is injected
        # by the ``site`` module and may be missing (e.g. ``python -S``).
        sys.exit(-1)
def get_affinity():
    """Compute and save the RBF-kernel affinity matrix of the sample vectors.

    Reads ``./models/trump_sample_vectors.pickle``, applies ``rbf_kernel``,
    stores the result in ``./models/trump_sample_affinity.pickle``, then
    prints the matrix shape and the elapsed wall-clock time.
    """
    t0 = time()
    vectors = loadPickle('./models/trump_sample_vectors.pickle')
    A = rbf_kernel(vectors)
    savePickle(A, './models/trump_sample_affinity.pickle')
    print(A.shape)
    # The message previously read "Spectral clustering took ...", copied
    # from spectral_cluster(); this function only builds the affinity.
    print("Affinity computation took {}s".format(time() - t0))