def __init__(self, data_path, epsilon=1, seed=42):
    """
    Set up the object dataset from a configuration file.

    The dataset holds the state vectors of each object.

    Implementation notes : for now vectors are meant to be yielded in
    batches of 1, as (object vectors, index) tuples. The index is the
    configuration index used to compare two different sets of objects
    (equal indices mean equal configurations).

    The configurations are stored as a list of (list of arrays, int)
    tuples, as returned by ut.from_file().

    Arguments :
        - data_path : path to the data file
        - epsilon (float between 0 and 1) : proportion, for one
            configuration, of similar configurations in the dataset.
            This leads to an epsilon**2 to one imbalance in the
            comparison dataset for the positive ('same') class. To
            overcome this, the negative class is undersampled by
            dropping negative examples with a probability of
            1 - epsilon**2.
        - seed : value handed to np.random.seed(); also kept on the
            instance as self._seed.
    """
    # Load the configurations first, exactly as before, so that a failing
    # load happens before any other attribute is set.
    self._configs = ut.from_file(data_path)

    self.epsilon = epsilon
    self._nb_objects = 3
    self._seed = seed

    # Seeds numpy's global random state (not a private generator).
    np.random.seed(self._seed)
def word_use_by_year(self, word):
    """
    Print the rank of `word` for every entry of 'zipf_by_year.json'.

    For each entry of the loaded data, one line is printed made of the
    entry's 'name', a space, and the 1-based position of `word` inside
    the entry's 'top_words' (or 'No presente' when the word is absent).

    Arguments :
        - word : the word to look up in each entry's 'top_words'.

    Returns nothing; the result is only printed.
    """
    data = utils.from_file('zipf_by_year.json')
    for key in data:
        entry = data[key]
        try:
            # A single scan: index() both tests membership and finds the rank.
            rank = str(entry['top_words'].index(word) + 1)
        except ValueError:
            rank = 'No presente'
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(entry['name'] + ' ' + rank)
def new(self, input_file, number_of_blocks=None, block_size=None):
    """
    Load a new input file, run every test on it and collect the p-values.

    Arguments :
        - input_file : path handed to utils.from_file(); also stored as
            self.string_filename.
        - number_of_blocks : forwarded unchanged to Tester.run_all_tests().
        - block_size : forwarded unchanged to Tester.run_all_tests().

    Side effects : sets self.string_filename, self.string and self.tester,
    and fills self.p_values[test] with the `p` attribute of every result
    the tester produced for that test.
    """
    self.string_filename = input_file
    self.string = utils.from_file(self.string_filename)

    # NOTE(review): the tester is built from the bare name `testnames`
    # while the loop below reads `self.testnames` -- confirm that both
    # refer to the same collection.
    self.tester = Tester(testnames)
    self.tester.run_all_tests(self.string, number_of_blocks, block_size)

    for test_name in self.testnames:
        test_results = self.tester.results[test_name]
        self.p_values[test_name] = [result.p for result in test_results]
def word_use_by_year(self, word):
    """
    Print the rank of `word` for every entry of 'zipf_by_year.json'.

    For each entry of the loaded data, one line is printed made of the
    entry's 'name', a space, and the 1-based position of `word` inside
    the entry's 'top_words' (or 'No presente' when the word is absent).

    Arguments :
        - word : the word to look up in each entry's 'top_words'.

    Returns nothing; the result is only printed.
    """
    data = utils.from_file('zipf_by_year.json')
    for key in data:
        entry = data[key]
        try:
            # A single scan: index() both tests membership and finds the rank.
            rank = str(entry['top_words'].index(word) + 1)
        except ValueError:
            rank = 'No presente'
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(entry['name'] + ' ' + rank)
"""
Find how many of the words in words.txt are triangle words.

All the words are upper case.
"""
from utils import from_file, is_triangle

# Alphabetical value of each upper-case letter: A -> 1, ..., Z -> 26.
values = {chr(ord("A") + i): i + 1 for i in range(26)}

# The file content is split on commas and each word is stripped of its
# surrounding double quotes.
content = from_file("data/words.txt")
content = content.strip()
words = [word.strip("\"") for word in content.split(",")]

# A word counts when the sum of its letter values satisfies is_triangle().
tris = sum(
    1
    for word in words
    if is_triangle(sum(values[letter] for letter in word))
)

# Single-argument print(...) behaves the same on Python 2 and 3.
print(tris)
### Test script to show off some of the functionality of codebase ###

from time import time
from utils import to_dic, from_file
from cdd_domains import domain_manager

if __name__ == '__main__':
    start = time()
    domains = from_file(domain_manager, 'FullSeaUrchin.txt')  # .15 seconds for this, about 6 seconds for full file
    domains = to_dic(domains)

    # The elapsed time and the domain count are interpolated into the
    # message; previously the str(...) calls sat inside the string literal
    # and were printed verbatim.
    elapsed = time() - start
    print('It took %s seconds to read in %s sea urchin domains and assign '
          'unique dictionary keys' % (elapsed, len(domains)))

    ### add some domains from scratch ###
    domains['fake entry'] = domain_manager.dict_make(Query='Fake Query')
    print(domains['fake entry'])
### Test script to show off some of the functionality of codebase ###

from time import time
from utils import to_dic, from_file
from cdd_domains import domain_manager

if __name__ == '__main__':
    start = time()
    domains = from_file(domain_manager, 'FullSeaUrchin.txt')  # .15 seconds for this, about 6 seconds for full file
    domains = to_dic(domains)

    # The elapsed time and the domain count are interpolated into the
    # message; previously the str(...) calls sat inside the string literal
    # and were printed verbatim.
    elapsed = time() - start
    print('It took %s seconds to read in %s sea urchin domains and assign '
          'unique dictionary keys' % (elapsed, len(domains)))

    ### add some domains from scratch ###
    domains['fake entry'] = domain_manager.dict_make(Query='Fake Query')
    print(domains['fake entry'])
self.d2 = self.psi0 - 2*self.psi1 + self.psi2 self.p1 = sp.gammaincc(2**(self.m-2), self.d1/2) self.p2 = sp.gammaincc(2**(self.m-3), self.d2/2) self.p = self.p1 self.success = (self.p >= 0.01) self.test_run = True def psi_sq_mv1(self, m, padded_bits): counts = [0 for i in range(2**m)] for idx in range(self.n): block = padded_bits[idx:idx+m] block_as_int = int(''.join([str(i) for i in block]), 2) counts[block_as_int] += 1 psi_sq_m = sum([c**2 for c in counts])*(2**m)/self.n-self.n return psi_sq_m if __name__ == '__main__': import sys sys.path.append('../') import utils e = '../numbers/data.e' bits = utils.from_file(e)[:1000000] # bits = utils.from_string('0011011101') S = Serial(bits) print(S.p2) # p-value = 0.56195
if __name__ == '__main__':
    # Ad-hoc smoke run of the helpers used below; results are only printed.
    strictnamedtuple('hi')

    d0 = DomainCDD()
    d1 = DomainCDD(Query='steve', Start=20.0)
    d2 = DomainCDD(Query='bo', Start=8.0)
    d3 = DomainCDD(Query='gay', Start=50.0)
    # Explicit formatting keeps the "<dir listing> <record>" output identical
    # on Python 2 and Python 3.
    print('%s %s' % (dir(d1), d1))

    # Store each record in the manager under its own unique key.
    ds = (d1, d2, d3)
    dic = ManagerCDD()
    for d in ds:
        dic[d.get_uniquekey()] = d

    domains = from_file('TestSet.txt')
    print(len(domains))
    dfile = to_dic(domains)
    fd = formatted_domains(dfile)
    a = network_diagram(fd, 'cl09099')
    network_outfile(a, 'junknetwork', summary=True, adjacency=True)

    # The extra field has to be appended as a one-element tuple: adding a
    # bare string to a tuple of field names raises TypeError.
    # NOTE(review): assumes DomainCDD._fields is a sequence of field names,
    # as on namedtuple classes -- confirm.
    Pixel = namedtuple('DomainCDD', tuple(DomainCDD._fields) + ('FagField',))
    # NOTE(review): a plain namedtuple has no field defaults, so calling
    # Pixel() without arguments may raise TypeError -- confirm the intent.
    p = Pixel()
    print(p)

##################
######SCRAP#######
##################
"""
For each item in the list of names, get an alphabetical score.

Then multiply it by its position in the list to get an overall score.
"""
from utils import from_file

content = from_file("data/p022_names.txt")

# Alphabetical value of each upper-case letter: A -> 1, ..., Z -> 26.
values = {chr(ord("A") + i): i + 1 for i in range(26)}

# Names are comma separated and wrapped in double quotes; positions are
# taken from the alphabetically sorted list.
content = content.strip()
names = sorted(word.strip("\"") for word in content.split(","))

# Positions are 1-based, hence enumerate(..., 1).
total = 0
for position, name in enumerate(names, 1):
    score = sum(values[letter] for letter in name)
    total += score * position

# Single-argument print(...) behaves the same on Python 2 and 3.
print(total)