Example #1
    def __call__(self, *args, save=None, **kwargs):
        o = self.process(*args, **kwargs)
        self.assert_return_keys(o)

        if save is not None:
            with open(save, 'wb') as f:
                pickle.dump(o, f)

        return o
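A minimal usage sketch for the pattern above; the class name Pipeline and the input value are hypothetical stand-ins for whatever object defines this __call__, and the result is read back with the standard pickle.load:

import pickle

pipeline = Pipeline()                          # hypothetical object providing __call__ above
result = pipeline(inputs, save='result.pkl')   # runs process() and pickles the return value

with open('result.pkl', 'rb') as f:
    restored = pickle.load(f)                  # same object that __call__ returned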
Example #2
def zapis_grup(grupy, par=None):
    """
    Saves the groups as a binary file under the name and path given by par.
    :param grupy: dictionary of groups
    :param par: path
    :return:
    """
    if par is None:
        par = ["grupy"]
    with open(par[0] + ".bin", 'wb') as plik:
        pickle.dump(grupy, plik)
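A counterpart loader, sketched only for illustration (the name wczytaj_grupy is hypothetical and not part of the source):

import pickle

def wczytaj_grupy(par=None):
    # Read the groups dictionary back from the file written by zapis_grup.
    if par is None:
        par = ["grupy"]
    with open(par[0] + ".bin", 'rb') as plik:
        return pickle.load(plik)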
Example #3
 def trainHMM( self, trainingFiles ):
     ''' Train the HMM '''
     self.hmm = HMM( self.labels, self.featureNames, self.contOrDisc, self.numFVals )
     allStrokes = []
     allLabels = []
     for f in trainingFiles:
         # print "Loading file", f, "for training"
         strokes, labels = self.loadLabeledFile( f )
         allStrokes.append(strokes)
         allLabels.append(labels)
     allObservations = [self.featurefy(s) for s in allStrokes]
     self.hmm.train(allObservations, allLabels)
     picklefy = Picklefy()  # renamed so the local variable does not shadow the pickle module
     picklefy.save(self.hmm, 'hmmbasic.pickle')
Example #4
def get_tid_to_filelength_dict():

    try:
        tid_to_filelength_dict = pickle.load(gzip.open(PATH_TOKENIZED +
                    'tid_to_filelength_dict.pickle', 'rb'))

    except IOError:
        print("Preprocessed tid_to_filelength_dict not available. Creating a new one.")

        tid_to_filelength_dict = {}

        db = Database('TOB_FULL')
        con, cur = db.connect()

        cur.execute('SELECT tid from docs')
        count = 0
        while True:
            count += 1
            if count % 10000 == 0:
                print(count)
            row = cur.fetchone()
            if not row: break
            tid = row['tid']

            filepath = '{}{}/{}/{}/{}/{}'.format(PATH_OCR_FILES, tid[0], tid[1], tid[2], tid[3], tid + ".txt")

            array_len = 10000
            end = None
            while True:
                b = bytearray(array_len)
                with io.open(filepath, 'rb') as f:   # close the file handle each pass
                    f.readinto(b)
                text = b.decode('cp1252', errors='ignore')   # avoid shadowing the builtin str
                # the first NUL marks the end of the file's content in the zero-filled buffer
                end = text.find('\x00')
                if end > -1:
                    break
                else:
                    array_len *= 10

            tid_to_filelength_dict[tid] = end
        pickle.dump(tid_to_filelength_dict, gzip.open(PATH_TOKENIZED + 'tid_to_filelength_dict.pickle', 'wb'))
        print("Longest file is {} bytes long.".format(max(tid_to_filelength_dict.values())))


    # if the number of tids in the dict != DOC_COUNT, something is wrong
    assert len(tid_to_filelength_dict) == DOC_COUNT, "Length of tid_to_filelength_dict ({}) does not equal DOC_COUNT ({})".format(len(tid_to_filelength_dict), DOC_COUNT)
    return tid_to_filelength_dict
Example #5
doOpt = False
doMCUncertainty = False
doLikelihoodDist = False

recalc = True
rewrite = True
if not recalc:
    closures = load(submitDir + '/closures_ABCD.p')
    if not doSysts:
        twosigmas = load(submitDir + '/twosigmas_brazil_ABCD.p')
    else:
        twosigmas = load(submitDir + '/twosigmas_brazil_systs_ABCD.p')
else:
    closures, twosigmas = doMLE(doSysts, doMCUncertainty, doLikelihoodDist)
    if rewrite:
        if not doSysts:
            save(twosigmas, open(submitDir + '/twosigmas_brazil_ABCD.p', 'wb'))
        else:
            save(twosigmas,
                 open(submitDir + '/twosigmas_brazil_systs_ABCD.p', 'wb'))
        #save(closures,open(submitDir+'/closures_ABCD.p','wb'))
        #save(twosigmas,open(submitDir+'/twosigmas_ABCD.p','wb'))
        #save(fivesigmas,open(submitDir+'/fivesigmas_ABCD.p','wb'))

doBrazil = True
if doBrazil:
    for v in VBFmasscuts:
        plt.errorbar(amasscuts, [twosigmas[a, v][0] for a in amasscuts],
                     label='VBF $M_{jj}$ cut = ' + str(v) + ' GeV',
                     color='black',
                     ls='--')
        plt.fill_between(amasscuts, [twosigmas[a, v][4] for a in amasscuts],
Example #6
 def save_data(self, filename):
     # Pickle this object to the given path (assumes pkl is the pickle module).
     with open(filename, 'wb') as f:
         pkl.dump(self, f)
Example #7
def save_pickle(f, d):
    if f is not None:
        with open(f, 'wb') as file:
            pickle.dump(d, file)
    else:
        raise ValueError
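A matching loader, added here as a sketch (load_pickle is a hypothetical counterpart, not in the source):

import pickle

def load_pickle(f):
    # Counterpart to save_pickle above: read the pickled object back from f.
    if f is None:
        raise ValueError
    with open(f, 'rb') as file:
        return pickle.load(file)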
Example #8
from tbselenium.tbdriver import TorBrowserDriver
import pickle
from bs4 import BeautifulSoup

tbpath = "tor-browser_en-US"

with open("oniontree-source.html", 'r') as f:
    data = f.read().replace('\n', '')

driver = TorBrowserDriver(tbpath)
# driver.load_url(website)

soup = BeautifulSoup(data, 'html.parser')

anchors = soup.find_all('a')
l = map(lambda x: x.get("href"), anchors)

potential_onions = []

for url in l:
    driver.load_url(url)
    e = driver.find_element_by_class_name("urls")
    onions = e.find_elements_by_tag_name("a")
    if len(onions) > 5:
        print("too many onions for {}. skipping".format(url))
        continue
    for o in onions:
        print(o.get_attribute('href'))
        potential_onions.append(o.get_attribute('href'))

with open('onions.sav', 'wb') as f:
    pickle.dump(potential_onions, f)
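A short read-back sketch for a later session, using the file name from the snippet above:

import pickle

with open('onions.sav', 'rb') as f:
    potential_onions = pickle.load(f)        # the list of harvested onion links
print(len(potential_onions), "candidate onion links")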
Example #9
def save_coords(diction):
    # pickle.dump takes (obj, file); open save.p in binary write mode.
    with open("save.p", "wb") as f:
        pickle.dump(diction, f)
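An illustrative counterpart for reading the coordinates back (load_coords is hypothetical, not in the source):

import pickle

def load_coords():
    # Read the coordinates dict written by save_coords above.
    with open("save.p", "rb") as f:
        return pickle.load(f)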
Example #11
    print('The image width is', img_width)

    trn_gen, val_gen, tst_gen = ImageGenerator.get_generators(img_width)
    model, callbacks = UNetModel.get_unet_model(img_width)
    num_epochs = 10

    print('Training initialized\n')
    start = time.time()
    history = model.fit_generator(trn_gen,
                                  steps_per_epoch=2035,
                                  epochs=num_epochs,
                                  validation_data=val_gen,
                                  validation_steps=252,
                                  callbacks=callbacks)

    stop = time.time()
    print('Training complete\nSaving model')
    model.save('model.h5')
    pickle.dump(history.history, open('history.p', 'wb'))

    trn_acc = history.history.get('dice_coef')
    val_acc = history.history.get('val_dice_coef')
    tst_acc = [
        model.evaluate_generator(tst_gen, steps=252)[1]
        for _ in range(num_epochs)
    ]

    print('Training Time:', stop - start, 'seconds')
    print('Average Training Accuracy: ', np.mean(trn_acc))
    print('Average Validation Accuracy: ', np.mean(val_acc))
    print('Average Testing Accuracy: ', np.mean(tst_acc))
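A small read-back sketch for the pickled history (assumes the history.p file and the dice_coef metric name used in the snippet above):

import pickle

with open('history.p', 'rb') as f:
    history_dict = pickle.load(f)            # the Keras History.history dict
print('epochs recorded:', len(history_dict.get('dice_coef', [])))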
Example #12
"""
Library imports
External only
"""
import sys
import socket
import pickle
import getopt
"""
Function descriptions
"""


def ParseCMDArgs(args):
    pass


#Main func
parameters = sys.argv[1:]
parsedData = ParseCMDArgs(parameters)  #execPath,noReplica,debug
sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)

msg = pickle.dumps(parsedData)  # serialize to bytes for the UDP message
recv_addr = ("127.0.0.1", 8000)
sock.sendto(msg, recv_addr)
data, _ = sock.recvfrom(1024)  # recvfrom returns (data, address)

if pickle.loads(data) == -1:  # the reply is assumed to be a pickled status code
    print("System has exhausted its quota of remote processes")
    exit(0)
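For illustration only, a minimal sketch of what the receiver on 127.0.0.1:8000 might look like under the same pickle-over-UDP convention (entirely hypothetical; not part of the source). Note that unpickling data from a socket is only safe when both ends are trusted.

import socket
import pickle

server = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
server.bind(("127.0.0.1", 8000))

payload, sender = server.recvfrom(1024)      # one pickled request
request = pickle.loads(payload)
print("got request:", request)
server.sendto(pickle.dumps(-1), sender)      # reply with a pickled status code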
Example #13
	def save(self, filepath):
		with open(filepath, 'wb') as f:
			pickle.dump(self.rects, f)
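A read-back sketch for the rectangles written above (the file name rects.pkl is only an example value for filepath):

import pickle

with open('rects.pkl', 'rb') as f:
    rects = pickle.load(f)                   # the list saved from self.rects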
Example #14
    'Society Culture', 'Science Mathematics', 'Health', 'Education Reference',
    'Computers Internet', 'Sports', 'Business Finance', 'Entertainment Music',
    'Family Relationships', 'Politics Government'
]

filename = './data/glove.840B.300d.w2vformat.txt'
model = gensim.models.KeyedVectors.load_word2vec_format(filename)
vector_size = model.vector_size
embedding_vectors = np.random.uniform(-0.001, 0.001,
                                      (len(wordtoix), vector_size))
glove_vocab = list(model.vocab.keys())
count = 0
mis_count = 0
for word in wordtoix.keys():
    idx = wordtoix.get(word)
    if word in glove_vocab:
        embedding_vectors[idx] = model[word]  # KeyedVectors supports direct indexing by word
        count += 1
    else:
        mis_count += 1
print("num of vocab in glove: {}".format(count))
print("num of vocab not in glove: {}".format(mis_count))

# print("load class embedding")
# name_list = [ k.lower().split(' ') for k in class_name]
# id_list = [ [ wordtoidx[i] for i in l] for l in name_list]
# value_list = [ [ opt.W_emb[i] for i in l]    for l in id_list]
# value_mean = [ np.mean(l)  for l in id_list]

pickle.dump([embedding_vectors, value_mean], open('./data/yahoo_emb.p', 'wb'))
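A read-back sketch for the embedding pickle written above (assumes the two-element list layout used in the dump call):

import pickle

with open('./data/yahoo_emb.p', 'rb') as f:
    embedding_vectors, value_mean = pickle.load(f)   # same order as in the dump above
print('embedding matrix shape:', embedding_vectors.shape)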
Example #15
import sys
import pickle
import gensim
from sklearn.ensemble import RandomForestClassifier

sys.path.append("src")
from fetch import *
from sparsify import *
from correlate import *

#Read training data from files and parse into sparse vectors
x,y = getData("small_tfidfvector_byhost_onlybody.csv.txt","webspam-uk2006-set1-labels.txt")

#Convert to gensim compatible corpus
corpus = gensim.matutils.Sparse2Corpus(x,False)

#derive topic model
lda = gensim.models.ldamodel.LdaModel(corpus,num_topics=100)

#save model to load while testing
lda.save('lda_model')

#derive topic proportions for training data
corpus_lda = lda[corpus]

#convert topic proportions to sparse representation
topDistSparse = sparsify(corpus_lda)

#find correlation between topics
corrTopics = correlate(topDistSparse)

#train and save classifier
clf = RandomForestClassifier(n_estimators=50)
clf.fit(corrTopics.toarray(),y)
pickle.dump(clf, open("classifier", "wb"))
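For completeness, a sketch of how the saved artifacts might be reloaded at test time (file names are taken from the snippet above; everything else is assumed):

import pickle
import gensim

lda = gensim.models.ldamodel.LdaModel.load('lda_model')   # reload the topic model
with open("classifier", 'rb') as f:
    clf = pickle.load(f)                                   # reload the trained random forest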