def sequenceReconstruction(self, org, seqs):
    """Return True when ``org`` is the only order reconstructible from ``seqs``.

    Consecutive items of every sequence become directed edges.  A Kahn-style
    topological sort is then run, and it only proceeds while exactly one node
    is available, which is what makes the reconstruction unique.

    :param org: the candidate full ordering (list).
    :param seqs: iterable of lists; each list is a subsequence constraint.
    :return: bool.
    """
    successors = defaultdict(list)   # node -> nodes that must follow it
    pending = defaultdict(int)       # node -> number of unmet predecessors
    seen = set()                     # every value mentioned in seqs

    for seq in seqs:
        seen.update(seq)
        if seq:
            # Register the head so it shows up with in-degree 0.
            pending[seq[0]] += 0
        for src, dst in zip(seq, seq[1:]):
            successors[src].append(dst)
            pending[dst] += 1

    frontier = [node for node, count in pending.items() if count == 0]
    order = []
    while len(frontier) == 1:
        current = frontier.pop()
        order.append(current)
        # defaultdict lookup keeps this safe for nodes without outgoing edges.
        for nxt in successors[current]:
            pending[nxt] -= 1
            if pending[nxt] == 0:
                frontier.append(nxt)
        if len(frontier) > 1:
            # Two candidates at once: the order is not unique.
            return False

    # Comparing lengths ensures every mentioned node was emitted.
    return len(order) == len(seen) and order == org
def nreport(neighbors): NN, NumN = defaultdict(int), defaultdict(int) ## Nearest, Number of neighbors for n in neighbors: nn = min(neighbors[n].values() or ['>25']) NN[nn] += 1 for d2 in neighbors[n].values(): NumN[d2] += 1 print print "Nearest neighbor counts:", showh(NN) print "Number of neighbors at each distance:", showh(NumN)
def histo(items):
    """Build a histogram from items or (item, count) tuples.

    A tuple contributes ``entry[1]`` to the bucket ``entry[0]``; anything else
    contributes 1 to its own bucket.  Returns a ``defaultdict(int)``.
    """
    counts = defaultdict(int)
    for entry in items:
        if isinstance(entry, tuple):
            key, weight = entry[0], entry[1]
        else:
            key, weight = entry, 1
        counts[key] += weight
    return counts
def __init__(self):
    """Initialise the instance with its three starting attributes."""
    # Mapping that creates an empty list for any key on first access.
    self.caps = defaultdict(list)
    self.total_caps = 100
    self.allmask = 0
def __init__(self, file_name, sheet_name='Data', columns=None, default_columns=False, classic_mode=False, **etc):
    """Validate the column list, then turn this object into a concrete writer.

    :param file_name: output path; its extension (.xls / .xlsx) picks the
        writer class unless ``classic_mode`` is set.
    :param sheet_name: forwarded to the chosen writer.
    :param columns: column titles; copied when given.
    :param default_columns: fallback list used when ``columns`` is empty.
    :param classic_mode: use ``mzSpreadsheetClassic.XLSheetWriter`` instead.
    :param etc: extra keyword arguments forwarded to the XLS/XLSX writers.
    :raises IOError: no columns available, or unsupported extension.
    :raises ValueError: two column titles are equal ignoring case.
    """
    if columns:
        self.columns = columns[:]
    elif default_columns:
        self.columns = default_columns + (columns or [])
    else:
        raise IOError("Columns not specified and default columns not selected!")

    # Duplicates exist when the case-folded set is SMALLER than the list
    # (the previous comparison was reversed and could never fire).
    lowered = [title.lower() for title in self.columns]
    if len(set(lowered)) < len(lowered):
        from collections import defaultdict
        counts = defaultdict(int)
        for title in lowered:
            counts[title] += 1
        raise ValueError("Column titles appear more than once: %s"
                         % [col for col in self.columns if counts[col.lower()] > 1])

    if classic_mode:
        import mzSpreadsheetClassic
        self.__class__ = mzSpreadsheetClassic.XLSheetWriter
        mzSpreadsheetClassic.XLSheetWriter.__init__(self, file_name, sheet_name, columns, default_columns)
    elif file_name.lower().endswith('.xls'):
        self.__class__ = XLSWriter
        XLSWriter.__init__(self, file_name, sheet_name, self.columns, **etc)
    elif file_name.lower().endswith('.xlsx'):
        self.__class__ = XLSXWriter
        XLSXWriter.__init__(self, file_name, sheet_name, self.columns, **etc)
    else:
        raise IOError("Invalid extension on filename %s given to XLSheetWriter." % file_name)
def p650(N, p):
    """Compute the sum of all divisors of B(n) mod p for n = 1, ..., N.

    For each n the prime exponents of B(n) are accumulated from the table
    returned by ``n2factors(N)`` (indexed by integer, each entry iterated via
    ``.items()`` as (prime, multiplicity) pairs), then handed to
    ``compute_sum_of_divisors``.
    """
    running_total = 0
    factor_table = n2factors(N)
    for n in tqdm(range(1, N + 1)):
        # Exponent of every prime in B(n), built from the compact expression.
        exponents = defaultdict(int)
        half = n // 2
        even_flag = (n + 1) % 2  # 1 if n is even
        for i in range(half):
            weight = 2 * (half - i) - even_flag
            # Numerator term (n - i) raised to ``weight``.
            for prime, mult in factor_table[n - i].items():
                exponents[prime] += mult * weight
            # Denominator term (i + 1) raised to ``weight``.
            for prime, mult in factor_table[i + 1].items():
                exponents[prime] -= mult * weight
        running_total = (running_total + compute_sum_of_divisors(exponents, p)) % p
    return running_total
def findAllWords(data):
    """Count how often each word occurs across all documents.

    Every element of ``data`` must expose an iterable ``words`` attribute.
    Returns a ``defaultdict(int)`` mapping word -> occurrence count.
    """
    tally = defaultdict(int)
    for document in data:
        for token in document.words:
            tally[token] += 1
    return tally
def __init__(self, actions):
    """Store the grid size, the numeric hyper-parameters and the action list.

    :param actions: kept unchanged on ``self.actions``.
    """
    self.width = 5
    self.height = 5
    self.learning_rate = 0.01
    self.discount_factor = 0.9
    self.actions = actions
    self.epsilon = 0.1
    # NOTE(review): in the whitespace-collapsed source this assignment sits
    # behind a bare '#', most likely an emptied comment line directly above
    # it. It is restored as live code; confirm against the original file.
    self.value_tables = defaultdict(float)
def __init__(self, datasetroot=kDatasetPath, datasetname='training', csvname='reference.csv'):
    """Record where the dataset lives and create empty result containers.

    :param datasetroot: stored on ``_datasetroot`` (defaults to the
        module-level ``kDatasetPath``).
    :param datasetname: stored on ``_datasetname``.
    :param csvname: stored on ``_csvname``.
    """
    # Containers that start out empty.
    self._cases = defaultdict(list)
    self._labels = []
    self._filelist = []
    # Location settings, stored verbatim.
    self._csvname = csvname
    self._datasetname = datasetname
    self._datasetroot = datasetroot
def creport(drange, dcrange):
    """Print a clustering report to stdout (Python 2 print statements).

    Relies on names defined elsewhere in the file: ``neighbors``, ``glen``,
    ``n``, ``pct``, ``cluster``, ``showh``, ``showc``, ``margin``,
    ``diameter`` and ``dist``.

    drange  -- values passed as the second argument of ``cluster`` (table rows)
    dcrange -- values passed as the third argument of ``cluster`` (table columns)

    NOTE(review): the source was whitespace-collapsed; the nesting below is
    reconstructed. ``f`` calls ``table``, so it must live inside this
    function; ``pct_near_another`` is assumed to as well. Neither is called
    here, and ``f`` calls ``table`` with one argument although ``table``
    takes two -- confirm against the original file.
    """
    def table(what, fn):
        # One row per d in drange, one column per dc in dcrange; each cell is
        # fn applied to the clustering for that (d, dc) pair.
        print "\n" + what
        print ' '*8, ' '.join([' '+pct(dc, glen) for dc in dcrange])
        for d in drange:
            print '%s (%2d)' % (pct(d, glen), d),
            for dc in dcrange:
                print '%5s' % fn(cluster(neighbors, d, dc)),
            print
    print '\nNearest neighbor must be closer than this percentage (places). '
    print 'Each column: all genomes in cluster within this percentage of each other.'
    table("Number of clusters", len)
    cluster1 = cluster(neighbors, 8, 15) ## splits Cleora
    print '\nNumber of clusters of different sizes:', showh(len(c) for c in cluster1)
    # M: margin -> number of clusters; T: margin -> total members in them.
    M, T = defaultdict(int), defaultdict(int)
    for c in cluster1:
        M[margin(c)] += 1; T[margin(c)] += len(c)
    for x in M: print '%d\t%d\t%d'% (x,M[x],T[x])
    print '\nMargins', showh(M)
    # Show every cluster whose margin is at most 16.
    for c in cluster1:
        if margin(c) <= 16:
            print showc(c)
    print '\nScatter plot of cluster diameter vs. margin.'
    for c in cluster1:
        if diameter(c) > 0:
            pass #print '%d\t%d' % (diameter(c), margin(c))
    print '\nDifference from cluster(neighbors, 11, 14):'
    #table(lambda cl: pct(len(cluster1)-compare(cluster1, cl),max(len(cluster1),len(cl))))
    print '\nNumber of clusters witth more than one species name:'
    #table(lambda cl: sum(nspecies(c) > 1 for c in cl))
    def pct_near_another(clusters, P=1.25):
        # Counts (g, g2) pairs where g2 is a neighbor of g outside g's cluster
        # and closer than P times that cluster's diameter.
        total = 0
        for c in clusters:
            d = diameter(c)
            for g in c:
                for g2 in neighbors[g]:
                    if g2 not in c and dist(g, g2) < P*d:
                        total += 1
        return pct(total, n)
    def f(P):
        print '\nPercent of individuals within %.2f*diameter of another cluster.'%P
        table(lambda cl: pct_near_another(cl, P))
def scorer(strand, orfs, freq_reads):
    """Collect read values for ORF starts and for non-ORF ("negative") positions.

    Uses the module-level ``genome`` and ``genome_l`` (and ``revcom`` for the
    minus strand).

    :param strand: ``'plus'`` or ``'min'``; any other value returns ``None``.
    :param orfs: mapping start position -> list; lists are extended in place.
    :param freq_reads: object indexed as ``freq_reads.loc['<pos>-<strand>']``
        (presumably a pandas object -- confirm with the caller).
    :return: ``(orfs, negatives)``; ``negatives`` maps every position in
        ``range(genome_l)`` to a list.

    The previous version returned ``orfs_plus`` / ``orfs_min``, names that are
    never bound in this function; the mutated ``orfs`` argument is returned
    instead.
    """
    stop_codons = ['TAG', 'TGA', 'TAA']

    if strand == 'plus':
        orf_starts_plus = set(orfs.keys())
        negatives_plus = defaultdict(lambda: 1, {x: [] for x in range(genome_l)})
        for pos in range(genome_l - 1):
            try:
                no_go = set([pos + x for x in range(-4, 3)])
                if pos - 2 in orf_starts_plus:
                    orfs[pos - 2].extend([x for x in freq_reads.loc['%s-plus' % (pos)]])
                elif len(no_go.intersection(orf_starts_plus)) > 0:
                    # Too close to an ORF start to count as a negative.
                    continue
                elif any(codon in genome[pos - 3:pos + 9] for codon in stop_codons):
                    continue
                else:
                    negatives_plus[pos].extend([x for x in freq_reads.loc['%s-plus' % (pos)]])
            except Exception:
                # Best effort: report the position and move on (e.g. a
                # position with no entry in freq_reads).
                print(pos, strand)
                continue
        return orfs, negatives_plus

    if strand == 'min':
        orf_starts_min = set(orfs.keys())
        negatives_min = defaultdict(lambda: 1, {x: [] for x in range(genome_l)})
        for pos in range(genome_l - 1):
            try:
                no_go = set([pos + x for x in range(-1, 6)])
                if pos + 3 in orf_starts_min:
                    orfs[pos + 3].extend([x for x in freq_reads.loc['%s-min' % (pos)]])
                elif len(no_go.intersection(orf_starts_min)) > 0:
                    continue
                elif any(codon in revcom(genome[pos - 9:pos + 3]) for codon in stop_codons):
                    continue
                else:
                    negatives_min[pos].extend([x for x in freq_reads.loc['%s-min' % (pos)]])
            except Exception:
                print(pos, strand)
                continue
        return orfs, negatives_min
def sreport(species):
    """Print the diameter of each labelled group (Python 2 print statements).

    ``species`` is indexed by integers in ``range(n)``; ``n``, ``glen``,
    ``diameter`` and ``showh`` come from elsewhere in the file.

    NOTE(review): the source was whitespace-collapsed. The nesting below
    (detail line only when d > 14, histogram updated for every label) is
    reconstructed -- confirm against the original file.
    """
    SS = defaultdict(int)   # diameter -> number of labels with that diameter
    print
    for s in set(species):
        # Indices carrying the label s.
        c = [g for g in range(n) if species[g] == s]
        d = diameter(c)
        if d > 14:
            if d==glen: d = '>25'
            print 'diameter %s for %s (%d elements)' % (d, s, len(c))
        SS[d] += 1
    print 'Diameters of %d labelled clusters: %s' % (len(set(species)), showh(SS))
def domain_frequencies(documents):
    """Count documents per URL domain.

    Each document is queried with ``doc.get("url")``; documents whose url is
    missing or falsy are ignored.  The domain is obtained from
    ``get_domain(url)``.  Returns a ``defaultdict(int)``.
    """
    counts = defaultdict(int)
    for doc in documents:
        if not doc.get("url"):
            continue
        counts[get_domain(doc.get("url"))] += 1
    return counts
def topSolution(self, numCourses, prerequisites):
    """Return an ordering of ``range(numCourses)`` built by ``self.dfs``.

    :param numCourses: number of courses, labelled 0 .. numCourses-1.
    :param prerequisites: iterable of (u, v) pairs, stored as edge u -> v.
    :return: the list filled in by ``self.dfs``, or ``[]`` as soon as
        ``self.dfs`` returns a falsy value.
    """
    # ``import defaultdict`` is not a module; the class lives in collections.
    from collections import defaultdict

    graph = defaultdict(list)
    for u, v in prerequisites:
        graph[u].append(v)

    # 0 = Unknown, 1 = visiting, 2 = visited
    visited = [0] * numCourses
    path = []
    for i in range(numCourses):
        if not self.dfs(graph, visited, i, path):
            return []
    return path
def run(self):
    """Count lines per artist across all inputs and write a TSV summary.

    Every input line must split on whitespace into exactly three fields; the
    second field is the artist.  Output lines are ``artist<TAB>count``.
    """
    plays_by_artist = defaultdict(int)
    for target in self.input():
        with target.open('r') as source:
            for raw_line in source:
                # Three-way unpack doubles as a format check on the line.
                _, artist, track = raw_line.strip().split()
                plays_by_artist[artist] += 1

    with self.output().open('w') as sink:
        for artist, count in six.iteritems(plays_by_artist):
            sink.write('{}\t{}\n'.format(artist, count))
def gardenNoAdj(self, N: int, paths: List[List[int]]) -> List[int]:
    """Assign one of four flower types to gardens 1..N, neighbours differing.

    Gardens are processed in increasing order; each takes the smallest type
    in 1..4 not already used by a neighbour.

    :param N: number of gardens.
    :param paths: pairs ``[a, b]`` of connected gardens.
    :return: list where index ``i`` holds the type of garden ``i + 1``.
    """
    adjacency = defaultdict(list)
    for path in paths:
        first, second = path[0], path[1]
        adjacency[first].append(second)
        adjacency[second].append(first)

    flower = {}
    for garden in range(1, N + 1):
        taken = {flower[nb] for nb in adjacency[garden] if nb in flower}
        flower[garden] = [kind for kind in (1, 2, 3, 4) if kind not in taken][0]

    return [flower[garden] for garden in range(1, len(flower) + 1)]
def findDiagonalOrder(self, matrix: List[List[int]]) -> List[int]:
    """Return the elements of ``matrix`` in zig-zag diagonal order.

    Cells are grouped by ``row + col``.  Even-numbered diagonals are emitted
    bottom-to-top, odd-numbered ones top-to-bottom.

    :param matrix: rectangular list of lists (width taken from row 0).
    :return: flat list of values; ``[]`` for an empty matrix (previously
        ``None``, which contradicted the annotated return type).
    """
    if not matrix or not matrix[0]:
        return []

    rows, cols = len(matrix), len(matrix[0])
    diagonals = defaultdict(list)
    for i in range(rows):
        for j in range(cols):
            diagonals[i + j].append(matrix[i][j])

    result = []
    for k in range(rows + cols - 1):
        # Each diagonal was collected top-to-bottom; even ones run upwards.
        result.extend(reversed(diagonals[k]) if k % 2 == 0 else diagonals[k])
    return result
def solve(self, t, E):
    """Return 1 if node ``t`` is reachable from node 1, else 0.

    :param t: target node.
    :param E: iterable of directed edges; ``e[0]`` points to ``e[1]``.
    """
    adjacency = defaultdict(list)
    for edge in E:
        adjacency[edge[0]].append(edge[1])

    reached = {1}
    frontier = [1]
    while frontier:
        node = frontier.pop()
        for nxt in adjacency[node]:
            if nxt in reached:
                continue
            reached.add(nxt)
            frontier.append(nxt)

    return int(t in reached)
def solve(names):
    """Return, per name, its shortest prefix not seen in any earlier name.

    Names are inserted into a character trie in order.  If the whole name is
    already a path in the trie, the name itself is returned, followed by
    `` <k>`` when this is its k-th exact occurrence (k > 1).
    """
    root = {}
    answers = []
    for name in names:
        cursor = root
        emitted = False
        for length, ch in enumerate(name, start=1):
            if ch not in cursor:
                if not emitted:
                    # First character that leaves the known trie.
                    answers.append(name[:length])
                    emitted = True
                cursor[ch] = defaultdict(int)
            cursor = cursor[ch]
        # 'count' on the terminal node = exact occurrences of this name.
        cursor['count'] += 1
        if not emitted:
            occurrences = cursor['count']
            suffix = '' if occurrences == 1 else ' ' + str(occurrences)
            answers.append(name + suffix)
    return answers
def __init__(self, report_file, columns, default_columns, **kwargs):
    """Record the target file and validate the column titles.

    :param report_file: stored on ``self.file_name``.
    :param columns: column titles; copied when given.
    :param default_columns: fallback list used when ``columns`` is empty.
    :param kwargs: stored unchanged on ``self.extraArgs``.
    :raises IOError: neither ``columns`` nor ``default_columns`` is given
        (previously this surfaced as an AttributeError on ``self.columns``).
    :raises ValueError: two column titles are equal ignoring case.
    """
    self.file_name = report_file
    if columns:
        self.columns = columns[:]
    elif default_columns:
        self.columns = default_columns + (columns or [])
    else:
        raise IOError("Columns not specified and default columns not selected!")
    self.extraArgs = kwargs

    # Duplicates exist when the case-folded set is SMALLER than the list
    # (the previous comparison was reversed and could never fire).
    lowered = [title.lower() for title in self.columns]
    if len(set(lowered)) < len(lowered):
        from collections import defaultdict
        counts = defaultdict(int)
        for title in lowered:
            counts[title] += 1
        raise ValueError("Column titles appear more than once: %s"
                         % [col for col in self.columns if counts[col.lower()] > 1])

    self.data = []
def invalidTransactions(self, transactions):
    """Return the transactions that are possibly invalid.

    A transaction ``"name,time,amount,city"`` is reported when its amount
    exceeds 1000, or when the same name has a transaction in a different city
    at most 60 minutes away (inclusive on both sides).

    :type transactions: List[str]
    :rtype: List[str]
    """
    users = defaultdict(list)
    for tran in transactions:
        usr, ts, amt, cty = tran.split(',')
        users[usr].append((int(ts), int(amt), cty))

    res = []
    for usr in users:
        records = users[usr]
        records.sort()
        # [left, right) is the slice of records within 60 minutes of ``ts``;
        # both pointers only move forward because records are time-sorted.
        left = right = 0
        for ts, amt, cty in records:
            if amt > 1000:
                res.append(','.join([usr, str(ts), str(amt), cty]))
                continue
            while ts - records[left][0] > 60:
                left += 1
            # ``<=`` so a record exactly 60 minutes LATER is included, just
            # as one exactly 60 minutes earlier is kept by the loop above
            # (the old ``< 60`` flagged only the later half of such a pair).
            while right < len(records) and records[right][0] - ts <= 60:
                right += 1
            if any(other[2] != cty for other in records[left:right]):
                res.append(','.join([usr, str(ts), str(amt), cty]))
    return res
def populate(self, pdict: PDict, predicate):
    """Fill ``self.adj`` with a symmetric relation between dictionary words.

    Words come from ``pdict.cmu.values()`` and are sorted by the reversed
    ``stressless_repr()`` of their first pronunciation, so words with the same
    ending are adjacent.  For each word, the following words are tested with
    ``predicate`` until the last two items of the representation differ.
    Progress (a percentage) is printed every 1000 words.
    """
    self.adj = defaultdict(set)

    def reversed_sounds(entry):
        return list(reversed(entry.prns[0].stressless_repr()))

    ordered = sorted(pdict.cmu.values(), key=reversed_sounds)
    for index, first in enumerate(ordered):
        if index % 1000 == 0:
            print(index * 100.0 / len(ordered))
        for second in ordered[index:]:
            same_ending = (first.prns[0].stressless_repr()[-2:]
                           == second.prns[0].stressless_repr()[-2:])
            if not same_ending:
                # Sorted order: no later word can share the ending either.
                break
            if predicate(first, second):
                self.adj[first].add(second)
                self.adj[second].add(first)
def orf_finder(genome): seq = genome.seq min_orf = 3 max_orf = 10000 orfs = defaultdict() starts = [codon.start() for codon in re.finditer('ATG|GTG|TTG', seq)] for x in starts: stops = [codon.start()+x for codon in re.finditer('TAG|TGA|TAA', seq[x:x+max_orf])] for y in stops: if (y-x) > 0 and (y-x) % 3 == 0: if (y-x) < min_orf: break if min_orf <= (y-x) <= max_orf: orfs[x]=[y+2] break elif (y-x) >max_orf: break return orfs
def solve(names):
    """Return, per name, its shortest prefix not seen in any earlier name.

    A repeated name yields ``"<name> <k>"`` for its k-th occurrence.  A new
    name whose every prefix is already known yields the name itself.
    """
    known_prefixes = set()
    occurrences = defaultdict(int)
    output = []
    for name in names:
        if name in occurrences:
            occurrences[name] += 1
            output.append(name + " " + str(occurrences[name]))
            continue

        occurrences[name] = 1
        emitted = False
        for end in range(1, len(name) + 1):
            prefix = name[:end]
            if not emitted and prefix not in known_prefixes:
                output.append(prefix)
                emitted = True
            # Every prefix is recorded, even after one has been emitted.
            known_prefixes.add(prefix)
        if not emitted:
            output.append(name)
    return output
{'date': '07/02/2012', 'address': '5800 E 58TH'} {'date': '07/02/2012', 'address': '5645 N RAVENSWOOD'} {'date': '07/02/2012', 'address': '1060 W ADDISON'} 07/03/2012 {'date': '07/03/2012', 'address': '2122 N CLARK'} 07/04/2012 {'date': '07/04/2012', 'address': '5148 N CLARK'} {'date': '07/04/2012', 'address': '1039 W GRANVILLE'} In [16]: ...: from collections import defaultdict In [18]: %paste rows_by_date = defaultdict(list) for row in rows: rows_by_date[row['date']].append(row) ## -- End pasted text -- In [19]: ...: print(rows_by_date) defaultdict(<class 'list'>, {'07/02/2012': [{'date': '07/02/2012', 'address': '5 800 E 58TH'}, {'date': '07/02/2012', 'address': '5645 N RAVENSWOOD'}, {'date': ' 07/02/2012', 'address': '1060 W ADDISON'}], '07/03/2012': [{'date': '07/03/2012' , 'address': '2122 N CLARK'}], '07/04/2012': [{'date': '07/04/2012', 'address': '5148 N CLARK'}, {'date': '07/04/2012', 'address': '1039 W GRANVILLE'}], '07/01/ 2012': [{'date': '07/01/2012', 'address': '5412 N CLARK'}, {'date': '07/01/2012' , 'address': '4801 N BROADWAY'}]})
from collections import defaultdict

# (product, income) records; a product may appear several times.
incomes = [('Books', 1250.00),
           ('Books', 1300.00),
           ('Books', 1420.00),
           ('Tutorials', 560.00),
           ('Tutorials', 630.00),
           ('Tutorials', 750.00),
           ('Courses', 2500.00),
           ('Courses', 2430.00),
           ('Courses', 2750.00),]

# defaultdict(float) starts every product at 0.0, so totals can be
# accumulated without checking whether the key exists yet.
dd = defaultdict(float)
for product, income in incomes:
    dd[product] += income

for product, income in dd.items():
    print(f'Total income for {product}: ${income:,.2f}')
from collections import defaultdict

#Stuart- I had to edit the code myself because I received an indentation error on your documentation in
# quotes. Be careful because python is very particular about indentation. The way you called the print function
# also does not work for python 3. Python 3 calls it differently than 2.7
def anagram(name):
    """Print the largest group(s) of anagrams found in a word-list file.

    :param name: path of a text file containing one word per line.

    Each printed line shows the sorted-letter key followed by the list of
    words sharing it.  Nothing is printed for an empty file.
    """
    # get every word from the list and store them in wordbank
    wordbank = []
    with open(name) as file:
        for line in file:
            wordbank.append(line.rstrip())

    # Group the words by their sorted letters: anagrams share the same key.
    dict_anagram = defaultdict(list)
    for word in wordbank:
        key = ''.join(sorted(word))
        dict_anagram[key].append(word)

    # find the largest group size (0 when there are no words)
    length = 0
    for key, words in dict_anagram.items():
        if len(words) > length:
            length = len(words)

    # print the anagram groups with the largest size
    for key, words in dict_anagram.items():
        if len(words) > length - 1:
            print(key, words)
# Plain dictionary: the key must be initialised before it can be incremented.
stats = {}
key = 'my_counter'
if key not in stats:
    stats[key] = 0
stats[key] += 1

# defaultdict(int): missing keys start at 0, so no initialisation is needed.
from collections import defaultdict
stats = defaultdict(int)
stats['my_counter'] += 1
#!/usr/bin/python
from math import sqrt
from math import cos
import os
import math
import collections
import codecs
from collections import defaultdict

# NOTE(review): hard-coded working directory; the script fails at import time
# on any machine where this path does not exist.
os.chdir("F:\\college\\Sem7\\CF\\ml-100k\\ml-100k")

vmin = 1000.0
vmax = 0.0
movieVariances = defaultdict()
movieRatings = defaultdict(list)


def avg(ratings):
    """Return the arithmetic mean of ``ratings`` (raises on an empty list)."""
    s = 0.0
    for i in ratings:
        s = s + i
    return (s * 1.0)/len(ratings)


def variance(ratings):
    """Return the population variance of ``ratings`` (raises on an empty list)."""
    s = 0.0
    a = avg(ratings)
    for i in ratings:
        s = s + ((a-i)**2)
    return (s * 1.0)/len(ratings)


def computeNearestNeighbor(users, username):
    """creates a sorted list of users based on their distance to username"""
    # NOTE(review): only this first statement of the body is visible here.
    distances = []
# Advent of Code 2019 Day 18 # Many-Worlds Interpretation # planning import defaultdict import sys # define variables https://topaz.github.io/paste/#XQAAAQCnGQAAAAAAAAAzHIoib6pXbueH4X9F244lVRDcOZab5q1+VXY/ex42qR7D/JhOUAl0PRlKyZmMcX/t+JUQyym/jh2oG/1cutq3qMxmEFpEjHMJSSEEfDZRxC+e6/mi7CaFwh8r1QUUHa86RR8jiUxbzm+MWYJ9+ADHFKF0mdEWUJ5JmYhvst1+9wbHQaSR4QOsA59OhvWDAnlvmnnOG9Pa+cpYBE/81pFfWo5cWA9Z+Y0du2hwZ0o8GZzmXyMprlbe3wWClBSg4wc/YuB9229yePM0JLgzdvtqY15IRQcMxUmyBLDRXv1c2oUHVCuSNwjb90gG22nUDxkFlKCjAdySTfw4ACa/U82jdm/KrgZeigxUi0fbkLvBVB+kRzknSMafKM/aEdhlHAlfBKYP9NW3f5xkLyzRt8Rwfwgn8zsdJIdV0b9v6zWQLlUHRA6tfYB0RBiBKmIHkyjes3V1giRYoq9UyCDFBsmMVeLZ39gdcYLZpyApTvb8eUKZ5/WL9I7xmRUunpNalU00GmebZozPLsu7qeJh/0EOJMQ3yG0fo1gcoO/YsV2TUnYRJ4aFKgRZni0rNtoyhf7UpUdDR+NB1iDWP4omHP8YF1RxA1YcEi2V8YqyhJE7IIOr4dLxSQQZrzGb16K+zqH0jvVAUby9crfDGJgIyx5tsSnOU39Yw4WU4Vs6DT0It8Dr5QAjpFEquTrz0B08/vAk26XEfuMJJOfHVCI0PWNXhS5c2MrhAdSCfFBCVnovAZTXVcQixljtyAHdFsmHMt6eQItROPAAh3AOFHkLEPqBMEawOVQ2c3nYznIaWIf16cDyaj1SlXHM2BkxBQauvjwWzdnlgoEP/HdkDsH4f3FGbkWxiiqMogIbF+G85H/f4IU2wksMiTsRjP7vp33Nsn8Tc8DdEkv6SH5oJ29DZ0HU+aXzV9A69qaRX7R2YYhPkZEbMkuV5dUxwREkJsQmXsHF7zo2L9Ptnw43YwlfNy51kilROISWl2T3XpBs54MGqMuDRNBXTNcMTQrWl2o8g9hOvALc66FuPhp9JXiRRI/Vk9HDs9iaTTXh/gYfWv8vwoQKBTVNFglqL61mO8D3t3HgbDcaiEqsWU+UfiBNY0n0+T+iY/x6iUqKJpTbRtr9BsQTko3kKRc2kPDckqUH2ZqxHeVWrPJHfpjDTYuJHwV44uQXxx0WyxCigoakb67/X6zc8KG5YxMXlFZmW2UvraYDLTf9TP7E1Y69UKu2CwhTqpqqU0v85GmjUcjJmyZDftrLsXlrAlDnDNuoH7BXGCznqRCBcnOMgAvmRcf+HmcPXF3wcaojXlelzwafTINXTLDvmBcGzo1XJY3xQbqA8eLjmR1E3EdwxP0trxLOZn2+Qtbow8pConCwWIwo+fMLJEBWRYyS2BehncUc9TRZQpUvqv+mY+UrDvR3UXnkf5EjHtrPDcdgH3QKjK4F5Q9hc3EYJXkAU9b/8E2Di29uUPFGobaBurkl5jWgDsM2BmIfcR4SmsXsRSdf91D2UTr3Wf6f5UiNrxqblJT985hlaRpTr/nzGjOnbBtEOH1eqv6ksUK5W8/8brMIwsx6NPmBClLB/7NwknRGhl6fD6p4SKyD7Gqj2iKzpCKCmQ46+q9efvzaOpKpc/uIHcWv0Hvu1LS3docaAgSG/nEAp4H1vQEyxww7bSCBoDaHKBUmdDtC2jNqjZz/xc71w4RM9aynhOH2rn5oK42LLtzjdyiT0J+xQo2t7kN2m+jxCl4I6w+f6JgWNoVLU9I1vyd+WiMeft6flc2a7Ntj3BC+6/7EX6Jx3OAFSklMLlmAAx4G35dp0Mbw6U6xSL3/eSif11n
tphgyibHa7/PfMFOUM/PzlD76cifC66k9J1ZkaMMadQqfTYNeIQVXtqitU5gxzONwG5ykeaXB6MupQ0c9/yR0esMj35/fKfjTIOEC5lYvD9trZntGSK9jwGbQxNqqiY0ooXPbE2aFbB/z7fAzycE5QRumLm6Bhfx2t4bSgZRH+YWPB6BWkcLcfwKkKq2onzXfldiwT1GSguUevJSAAV7b2UIiEzY75tUhbzA0oZDlywx8i9FOIWEwcMqAvAlp/km3ARfZGM/lcQOa3DHUhw/D54S8JOhnEu4uqt5L9FiInnZJWyAdmZMTz7sL6pv+QWLyCTtnPTK06roWzgXIKA5kFD/j1LFVxhsYE282FOksj4/NcfUlB0bYvDVf3/A2haXFLrywp+N3qbZJ3FfWGJIERLlXNanxO7a30lO4RoM1DTdAe7rU2gLV6PdDxmh7v1XnkpfshndhahUOVHSwZSfb0YcOrlH/JzqGXwsZm0b5bzv/7GE0GA== grid = defaultdict(int) keys = {} gate = {} starting_point = [] #find shortest path https://topaz.github.io/paste/#XQAAAQCnGQAAAAAAAAAzHIoib6pXbueH4X9F244lVRDcOZab5q1+VXY/ex42qR7D/JhOUAl0PRlKyZmMcX/t+JUQyym/jh2oG/1cutq3qMxmEFpEjHMJSSEEfDZRxC+e6/mi7CaFwh8r1QUUHa86RR8jiUxbzm+MWYJ9+ADHFKF0mdEWUJ5JmYhvst1+9wbHQaSR4QOsA59OhvWDAnlvmnnOG9Pa+cpYBE/81pFfWo5cWA9Z+Y0du2hwZ0o8GZzmXyMprlbe3wWClBSg4wc/YuB9229yePM0JLgzdvtqY15IRQcMxUmyBLDRXv1c2oUHVCuSNwjb90gG22nUDxkFlKCjAdySTfw4ACa/U82jdm/KrgZeigxUi0fbkLvBVB+kRzknSMafKM/aEdhlHAlfBKYP9NW3f5xkLyzRt8Rwfwgn8zsdJIdV0b9v6zWQLlUHRA6tfYB0RBiBKmIHkyjes3V1giRYoq9UyCDFBsmMVeLZ39gdcYLZpyApTvb8eUKZ5/WL9I7xmRUunpNalU00GmebZozPLsu7qeJh/0EOJMQ3yG0fo1gcoO/YsV2TUnYRJ4aFKgRZni0rNtoyhf7UpUdDR+NB1iDWP4omHP8YF1RxA1YcEi2V8YqyhJE7IIOr4dLxSQQZrzGb16K+zqH0jvVAUby9crfDGJgIyx5tsSnOU39Yw4WU4Vs6DT0It8Dr5QAjpFEquTrz0B08/vAk26XEfuMJJOfHVCI0PWNXhS5c2MrhAdSCfFBCVnovAZTXVcQixljtyAHdFsmHMt6eQItROPAAh3AOFHkLEPqBMEawOVQ2c3nYznIaWIf16cDyaj1SlXHM2BkxBQauvjwWzdnlgoEP/HdkDsH4f3FGbkWxiiqMogIbF+G85H/f4IU2wksMiTsRjP7vp33Nsn8Tc8DdEkv6SH5oJ29DZ0HU+aXzV9A69qaRX7R2YYhPkZEbMkuV5dUxwREkJsQmXsHF7zo2L9Ptnw43YwlfNy51kilROISWl2T3XpBs54MGqMuDRNBXTNcMTQrWl2o8g9hOvALc66FuPhp9JXiRRI/Vk9HDs9iaTTXh/gYfWv8vwoQKBTVNFglqL61mO8D3t3HgbDcaiEqsWU+UfiBNY0n0+T+iY/x6iUqKJpTbRtr9BsQTko3kKRc2kPDckqUH2ZqxHeVWrPJHfpjDTYuJHwV44uQXxx0WyxCigoakb67/X6zc8KG5YxMXlFZmW2UvraYDLTf9TP7E1Y69UKu2CwhTqpqqU0v85GmjUcjJmyZDftrLsXlrAlDnDNuoH7BXGCznqRCBcnOMgAvmRcf+HmcPXF3wcaojXlelzwafTINXTLDvmBcGzo1XJY3xQbqA8eLjmR1E3EdwxP0trxLOZn2
+Qtbow8pConCwWIwo+fMLJEBWRYyS2BehncUc9TRZQpUvqv+mY+UrDvR3UXnkf5EjHtrPDcdgH3QKjK4F5Q9hc3EYJXkAU9b/8E2Di29uUPFGobaBurkl5jWgDsM2BmIfcR4SmsXsRSdf91D2UTr3Wf6f5UiNrxqblJT985hlaRpTr/nzGjOnbBtEOH1eqv6ksUK5W8/8brMIwsx6NPmBClLB/7NwknRGhl6fD6p4SKyD7Gqj2iKzpCKCmQ46+q9efvzaOpKpc/uIHcWv0Hvu1LS3docaAgSG/nEAp4H1vQEyxww7bSCBoDaHKBUmdDtC2jNqjZz/xc71w4RM9aynhOH2rn5oK42LLtzjdyiT0J+xQo2t7kN2m+jxCl4I6w+f6JgWNoVLU9I1vyd+WiMeft6flc2a7Ntj3BC+6/7EX6Jx3OAFSklMLlmAAx4G35dp0Mbw6U6xSL3/eSif11ntphgyibHa7/PfMFOUM/PzlD76cifC66k9J1ZkaMMadQqfTYNeIQVXtqitU5gxzONwG5ykeaXB6MupQ0c9/yR0esMj35/fKfjTIOEC5lYvD9trZntGSK9jwGbQxNqqiY0ooXPbE2aFbB/z7fAzycE5QRumLm6Bhfx2t4bSgZRH+YWPB6BWkcLcfwKkKq2onzXfldiwT1GSguUevJSAAV7b2UIiEzY75tUhbzA0oZDlywx8i9FOIWEwcMqAvAlp/km3ARfZGM/lcQOa3DHUhw/D54S8JOhnEu4uqt5L9FiInnZJWyAdmZMTz7sL6pv+QWLyCTtnPTK06roWzgXIKA5kFD/j1LFVxhsYE282FOksj4/NcfUlB0bYvDVf3/A2haXFLrywp+N3qbZJ3FfWGJIERLlXNanxO7a30lO4RoM1DTdAe7rU2gLV6PdDxmh7v1XnkpfshndhahUOVHSwZSfb0YcOrlH/JzqGXwsZm0b5bzv/7GE0GA== for y, row in enumerate(lines): for x, cell in enumerate(row): grid[(x, y)] = cell if cell == "@": pos = (x, y) elif cell >= "a" and cell <= "z": keys[cell] = (x, y) elif cell >= "A" and cell <= "Z": doors[cell.lower()] = (x, y) return grid, doors, keys, pos with open("input.txt") as f: for line in f:
def positionMap(l):
    """Map each distinct element of ``l`` to the list of indices where it occurs.

    Indices are in increasing order.  Returns a ``defaultdict(list)``.
    """
    index_lists = defaultdict(list)
    for index, element in enumerate(l):
        index_lists[element].append(index)
    return index_lists
def findAllLinks(data):
    """Count how often each link occurs across all documents.

    Every element of ``data`` must expose an iterable ``websites`` attribute.
    Returns a ``defaultdict(int)`` mapping link -> occurrence count.
    """
    link_counts = defaultdict(int)
    for doc in data:
        for url in doc.websites:
            link_counts[url] += 1
    return link_counts
Expected Output: defaultdict(<class 'set'>, {'Class-VII': {2}, 'Class-VI': {2}, 'Class-VIII': {3}, 'Class-V': {1}}) Click me to see the sample solution METHOD 1: lists= ['Class-V', 'Class-VI', 'Class-VII', 'Class-VIII'] list_two=[1,2,3,4] c=zip(lists,list_two) print(dict(c)) METHOD 2: from collections import defaultdict class_list = ['Class-V', 'Class-VI', 'Class-VII', 'Class-VIII'] id_list = [1, 2, 2, 3] temp = defaultdict(set) for c, i in zip(class_list, id_list): print(c,i) temp[c].add(i) print(temp) ----------------------------------------------------------------------------------- # 37. Write a Python program to replace dictionary values with their sum. Go to the editor # Click me to see the sample solution def bank_statements(*args): for d in args: n1=d.pop('Current Balance') n2=d.pop('Fixed Account') d['Overall_Account']=n1+n2 yield d
class Edit:
    """Holds a view together with a list of steps.

    ``defer`` is a class attribute, so one ``defaultdict(dict)`` is shared by
    every instance.
    """

    defer = defaultdict(dict)

    def __init__(self, view):
        """Remember ``view`` and start with no steps."""
        self.steps = []
        self.view = view
# ``defaultdict`` lives in ``collections``; ``strptime`` is a classmethod of
# the ``datetime`` class, not of the ``datetime`` module.
from collections import defaultdict
from datetime import datetime

# Create a defaultdict of an integer: monthly_total_rides
monthly_total_rides = defaultdict(int)

# Loop over the list daily_summaries
for daily_summary in daily_summaries:
    # Convert the service_date to a datetime object
    service_datetime = datetime.strptime(daily_summary[0], '%m/%d/%Y')

    # Add the total rides to the current amount for the month
    monthly_total_rides[service_datetime.month] += int(daily_summary[4])

# Print monthly_total_rides
print(monthly_total_rides)
In [8]: from collections import defaultdict In [9]: age_groups = defaultdict(list) In [10]: for person in people: ...: age_groups[person.age].append(person) ...: In [11]: for k in age_groups: ...: print(k, age_groups[k]) ...: 40 [40, 40] 18 [18, 18, 18] 42 [42] 25 [25] 23 [23] 80 [80] 67 [67]
# -*- coding: utf-8 -*-# Maximum Entropy Part-of-Speech Tagger for NLTK (Natural Language Toolkit)# Author: Arne Neumann# Licence: GPL 3 #__docformat__ = 'epytext en' """A I{part-of-speech tagger} that uses NLTK's build-in L{Maximum Entropymodels<nltk.MaxentClassifier>} to find the most likely I{part-of-speechtag} (POS) for each word in a given sequence.The tagger will be trained on a corpus of tagged sentences. For every wordin the corpus, a C{tuple} consisting of a C{dictionary} of features fromthe word's context (e.g. preceding/succeeding words and tags, wordprefixes/suffixes etc.) and the word's tag will be generated.The maximum entropy classifier will learn a model from these tuples thatwill be used by the tagger to find the most likely POS-tag for any givenword, even unseen ones.The tagger and the featuresets chosen for training are implemented as describedin Ratnaparkhi, Adwait (1996). A Maximum Entropy Model for Part-Of-SpeechTagging. In Proceedings of the ARPA Human Language Technology Workshop. Pages250-255.Usage notes:============Please install the MEGAM package (http://hal3.name/megam),otherwise training will take forever.To use the demo, please install either 'brown' or 'treebank' with:: import nltk nltk.download()in the Python interpreter. Proper usage of demo() and all other functions andmethods is described below.""" import timeimport refrom collections import defaultdict from nltk import TaggerI, FreqDist, untag, config_megamfrom nltk.classify.maxent import MaxentClassifier PATH_TO_MEGAM_EXECUTABLE = "/usr/bin/megam"config_megam(PATH_TO_MEGAM_EXECUTABLE) class MaxentPosTagger(TaggerI): """ MaxentPosTagger is a part-of-speech tagger based on Maximum Entropy models. """ def train(self, train_sents, algorithm='megam', rare_word_cutoff=5, rare_feat_cutoff=5, uppercase_letters='[A-Z]', trace=3, **cutoffs): """ MaxentPosTagger trains a Maximum Entropy model from a C{list} of tagged sentences. 
@type train_sents: C{list} of C{list} of tuples of (C{str}, C{str}) @param train_sents: A list of tagged sentences. Each sentence is represented by a list of tuples. Each tuple holds two strings, a word and its tag, e.g. ('company','NN'). @type algorithm: C{str} @param algorithm: The algorithm that is used by L{nltk.MaxentClassifier.train()} to train and optimise the model. It is B{strongly recommended} to use the C{LM-BFGS} algorithm provided by the external package U{megam<http://hal3.name/megam/>} as it is much faster and uses less memory than any of the algorithms provided by NLTK (i.e. C{GIS}, C{IIS}) or L{scipy} (e.g. C{CG} and C{BFGS}). @type rare_word_cutoff: C{int} @param rare_word_cutoff: Words with less occurrences than C{rare_word_cutoff} will be treated differently by L{extract_feats} than non-rare words (cf. Ratnaparkhi 1996). @type rare_feat_cutoff: C{int} @param rare_feat_cutoff: ignore features that occur less than C{rare_feat_cutoff} during training. @type uppercase_letters: C{regex} @param uppercase_letters: a regular expression that covers all uppercase letters of the language of your corpus (e.g. '[A-Z]' for German) @type trace: C{int} @param trace: The level of diagnostic output to produce. C{0} doesn't produce any output, while C{3} will give all the output that C{megam} produces plus the time it took to train the model. @param cutoffs: Arguments specifying various conditions under which the training should be halted. When using C{MEGAM}, only C{max_iter} should be relevant. For other cutoffs see L{nltk.MaxentClassifier} - C{max_iter=v}: Terminate after C{v} iterations. 
""" self.uppercase_letters = uppercase_letters self.word_freqdist = self.gen_word_freqs(train_sents) self.featuresets = self.gen_featsets(train_sents, rare_word_cutoff) self.features_freqdist = self.gen_feat_freqs(self.featuresets) self.cutoff_rare_feats(self.featuresets, rare_feat_cutoff) t1 = time.time() self.classifier = MaxentClassifier.train(self.featuresets, algorithm, trace, **cutoffs) t2 = time.time() if trace > 0: print "time to train the classifier: {0}".format(round(t2-t1, 3)) def gen_feat_freqs(self, featuresets): """ Generates a frequency distribution of joint features (feature, tag) tuples. The frequency distribution will be used by the tagger to determine which (rare) features should not be considered during training (feature cutoff). This is how joint features look like:: (('t-2 t-1', 'IN DT'), 'NN') (('w-2', '<START>'), 'NNP') (('w+1', 'of'), 'NN') @type featuresets: {list} of C{tuples} of (C{dict}, C{str}) @param featuresets: a list of tuples that contain the featureset of a word from the training set and its POS tag. @rtype: C{FreqDist} @return: a L{frequency distribution<nltk.FreqDist()>}, counting how often each (context information feature, tag) tuple occurs in the training sentences. """ features_freqdist = defaultdict(int) for (feat_dict, tag) in featuresets: for (feature, value) in feat_dict.items(): features_freqdist[ ((feature, value), tag) ] += 1 return features_freqdist def gen_word_freqs(self, train_sents): """ Generates word frequencies from the training sentences for the feature extractor. @type train_sents: C{list} of C{list} of tuples of (C{str}, C{str}) @param train_sents: A list of tagged sentences. @rtype: C{FreqDist} @return: a L{frequency distribution<nltk.FreqDist()>}, counting how often each word occurs in the training sentences. 
""" word_freqdist = FreqDist() for tagged_sent in train_sents: for (word, _tag) in tagged_sent: word_freqdist[word] += 1 return word_freqdist def gen_featsets(self, train_sents, rare_word_cutoff): """ Generates featuresets for each token in the training sentences. @type train_sents: C{list} of C{list} of tuples of (C{str}, C{str}) @param train_sents: A list of tagged sentences. @type rare_word_cutoff: C{int} @param rare_word_cutoff: Words with less occurrences than C{rare_word_cutoff} will be treated differently by L{extract_feats} than non-rare words (cf. Ratnaparkhi 1996). @rtype: {list} of C{tuples} of (C{dict}, C{str}) @return: a list of tuples that contains the featureset of a token and its POS-tag. """ featuresets = [] for tagged_sent in train_sents: history = [] untagged_sent = untag(tagged_sent) for (i, (_word, tag)) in enumerate(tagged_sent): featuresets.append( (self.extract_feats(untagged_sent, i, history, rare_word_cutoff), tag) ) history.append(tag) return featuresets def cutoff_rare_feats(self, featuresets, rare_feat_cutoff): """ Cuts off rare features to reduce training time and prevent overfitting. Example ======= Let's say, the suffixes of this featureset are too rare to learn. 
>>> featuresets[46712] ({'suffix(1)': 't', 'prefix(1)': 'L', 'prefix(2)': 'Le', 'prefix(3)': 'Lem', 'suffix(3)': 'ont', 'suffix(2)': 'nt', 'contains-uppercase': True, 'prefix(4)': 'Lemo', 'suffix(4)': 'mont'}, 'NNP') C{cutoff_rare_feats} would then remove the rare joint features:: (('suffix(1)', 't'), 'NNP') (('suffix(3)', 'ont'), 'NNP') ((suffix(2)': 'nt'), 'NNP') (('suffix(4)', 'mont'), 'NNP') and return a featureset that only contains non-rare features: >>> featuresets[46712] ({'prefix(1)': 'L', 'prefix(2)': 'Le', 'prefix(3)': 'Lem', 'contains-uppercase': True, 'prefix(4)': 'Lemo'}, 'NNP') @type featuresets: {list} of C{tuples} of (C{dict}, C{str}) @param featuresets: a list of tuples that contain the featureset of a word from the training set and its POS tag @type rare_feat_cutoff: C{int} @param rare_feat_cutoff: if a (context information feature, tag) tuple occurs less than C{rare_feat_cutoff} times in the training set, then its corresponding feature will be removed from the C{featuresets} to be learned. """ never_cutoff_features = set(['w','t']) for (feat_dict, tag) in featuresets: for (feature, value) in feat_dict.items(): feat_value_tag = ((feature, value), tag) if self.features_freqdist[feat_value_tag] < rare_feat_cutoff: if feature not in never_cutoff_features: feat_dict.pop(feature) def extract_feats(self, sentence, i, history, rare_word_cutoff=5): """ Generates a featureset from a word (in a sentence). The features were chosen as described in Ratnaparkhi (1996) and his Java software package U{MXPOST<ftp://ftp.cis.upenn.edu/pub/adwait/jmx>}. 
The following features are extracted: - features for all words: last tag (C{t-1}), last two tags (C{t-2 t-1}), last words (C{w-1}) and (C{w-2}), next words (C{w+1}) and (C{w+2}) - features for non-rare words: current word (C{w}) - features for rare words: word suffixes (last 1-4 letters), word prefixes (first 1-4 letters), word contains number (C{bool}), word contains uppercase character (C{bool}), word contains hyphen (C{bool}) Ratnaparkhi experimented with his tagger on the Wall Street Journal corpus (Penn Treebank project). He found that the tagger yields better results when words which occur less than 5 times are treated as rare. As your mileage may vary, please adjust L{rare_word_cutoff} accordingly. Examples ======== 1. This is a featureset extracted from the nonrare (word, tag) tuple ('considerably', 'RB') >>> featuresets[22356] ({'t-1': 'VB', 't-2 t-1': 'TO VB', 'w': 'considerably', 'w+1': '.', 'w+2': '<END>', 'w-1': 'improve', 'w-2': 'to'}, 'RB') 2. A featureset extracted from the rare tuple ('Lemont', 'NN') >>> featuresets[46712] ({'suffix(1)': 't', 'prefix(1)': 'L', 'prefix(2)': 'Le', 'prefix(3)': 'Lem', 'suffix(3)': 'ont', 'suffix(2)': 'nt', 'contains-uppercase': True, 'prefix(4)': 'Lemo', 'suffix(4)': 'mont'}, 'NNP') @type sentence: C{list} of C{str} @param sentence: A list of words, usually a sentence. @type i: C{int} @param i: The index of a word in a sentence, where C{sentence[0]} would represent the first word of a sentence. @type history: C{int} of C{str} @param history: A list of POS-tags that have been assigned to the preceding words in a sentence. @type rare_word_cutoff: C{int} @param rare_word_cutoff: Words with less occurrences than C{rare_word_cutoff} will be treated differently than non-rare words (cf. Ratnaparkhi 1996). @rtype: C{dict} @return: a dictionary of features extracted from a word's context. 
""" features = {} hyphen = re.compile("-") number = re.compile("\d") uppercase = re.compile(self.uppercase_letters) #get features: w-1, w-2, t-1, t-2. #takes care of the beginning of a sentence if i == 0: #first word of sentence features.update({"w-1": "<START>", "t-1": "<START>", "w-2": "<START>", "t-2 t-1": "<START> <START>"}) elif i == 1: #second word of sentence features.update({"w-1": sentence[i-1], "t-1": history[i-1], "w-2": "<START>", "t-2 t-1": "<START> %s" % (history[i-1])}) else: features.update({"w-1": sentence[i-1], "t-1": history[i-1], "w-2": sentence[i-2], "t-2 t-1": "%s %s" % (history[i-2], history[i-1])}) #get features: w+1, w+2. takes care of the end of a sentence. for inc in [1, 2]: try: features["w+%i" % (inc)] = sentence[i+inc] except IndexError: features["w+%i" % (inc)] = "<END>" if self.word_freqdist[sentence[i]] >= rare_word_cutoff: #additional features for 'non-rare' words features["w"] = sentence[i] else: #additional features for 'rare' or 'unseen' words features.update({"suffix(1)": sentence[i][-1:], "suffix(2)": sentence[i][-2:], "suffix(3)": sentence[i][-3:], "suffix(4)": sentence[i][-4:], "prefix(1)": sentence[i][:1], "prefix(2)": sentence[i][:2], "prefix(3)": sentence[i][:3], "prefix(4)": sentence[i][:4]}) if hyphen.search(sentence[i]) != None: #set True, if regex is found at least once features["contains-hyphen"] = True if number.search(sentence[i]) != None: features["contains-number"] = True if uppercase.search(sentence[i]) != None: features["contains-uppercase"] = True return features def tag(self, sentence, rare_word_cutoff=5): """ Attaches a part-of-speech tag to each word in a sequence. @type sentence: C{list} of C{str} @param sentence: a list of words to be tagged. @type rare_word_cutoff: C{int} @param rare_word_cutoff: words with less occurrences than C{rare_word_cutoff} will be treated differently than non-rare words (cf. Ratnaparkhi 1996). 
@rtype: C{list} of C{tuples} of (C{str}, C{str}) @return: a list of tuples consisting of a word and its corresponding part-of-speech tag. """ history = [] for i in xrange(len(sentence)): featureset = self.extract_feats(sentence, i, history, rare_word_cutoff) tag = self.classifier.classify(featureset) history.append(tag) return zip(sentence, history) def demo(corpus, num_sents): """ Loads a few sentences from the Brown corpus or the Wall Street Journal corpus, trains them, tests the tagger's accuracy and tags an unseen sentence. @type corpus: C{str} @param corpus: Name of the corpus to load, either C{brown} or C{treebank}. @type num_sents: C{int} @param num_sents: Number of sentences to load from a corpus. Use a small number, as training might take a while. """ if corpus.lower() == "brown": from nltk.corpus import brown tagged_sents = brown.tagged_sents()[:num_sents] elif corpus.lower() == "treebank": from nltk.corpus import treebank tagged_sents = treebank.tagged_sents()[:num_sents] else: print "Please load either the 'brown' or the 'treebank' corpus." size = int(len(tagged_sents) * 0.1) train_sents, test_sents = tagged_sents[size:], tagged_sents[:size] maxent_tagger = MaxentPosTagger() maxent_tagger.train(train_sents) print "tagger accuracy (test %i sentences, after training %i):" % \ (size, (num_sents - size)), maxent_tagger.evaluate(test_sents) print "\n\n" print "classify unseen sentence: ", maxent_tagger.tag(["This", "is", "so", "slow", "!"]) print "\n\n" print "show the 10 most informative features:" print maxent_tagger.classifier.show_most_informative_features(10) if __name__ == '__main__': demo("treebank", 200) #~ featuresets = demo_debugger("treebank", 10000) print "\n\n\n"
import argparse
import operator
from collections import defaultdict

# Command line: three readable input files and one writable output file.
parser = argparse.ArgumentParser()
parser.add_argument('regression_result_file', type=argparse.FileType('r'))
parser.add_argument('snp_coordinate_file', type=argparse.FileType('r'))
parser.add_argument('refgene_file', type=argparse.FileType('r'))
parser.add_argument('outfile', type=argparse.FileType('w'))
args = parser.parse_args()

# First tab-separated column of every line in the regression result file.
snp_list = [row.strip().split('\t')[0]
            for row in args.regression_result_file]

# Group coordinate records by their second column:
#   column 2 -> [(column 1, int(column 3)), ...]
# presumably chromosome -> [(snp id, position)]; verify against the input.
snp_coords = defaultdict(list)
for line in args.snp_coordinate_file:
    fields = line.strip().split('\t')
    try:
        snp_coords[fields[1]].append((fields[0], int(fields[2])))
    except (IndexError, ValueError):
        # Best effort: skip lines with fewer than three columns or a
        # non-integer third column instead of aborting.
        continue

gene_counts = defaultdict(int)
line = args.refgene_file.readline()
while line:
    fields = line.strip().split('\t')
    # third column with every 'chr' substring removed
    chrom = fields[2].replace('chr','')
scores_min, genome) log('Library coorection finished') ################# Score the data on nucleotide level all_scores = RET_iTP_scorer('Analysis/RET-iTP_scores_all.csv') log('scoring finished') ############### FIND ALL POSSIBLE ORFs pool = multiprocessing.Pool(processes=2) orfs_plus_coordinates, orfs_min_coordinates_rc = pool.map( orf_finder, [genome.seq, revcom(genome.seq)]) pool.close() orfs_min_coordinates = defaultdict(lambda: 1, { genome_l - x: [genome_l - y[0]] for x, y in orfs_min_coordinates_rc.items() }) log('ORFS located') ############################ Divide data into ORFs and negatives (non-ORFs) handle_orf_scores = 'Analysis/ORF_scores.csv' handle_negatives_scores = 'Analysis/negatives_scores.csv' (orfs_plus_scored, negatives_plus_scored) = scorer('plus', orfs_plus_coordinates, all_scores) (orfs_min_out_scored, negatives_min_scored) = scorer('min', orfs_min_coordinates, all_scores) RET_score_writer(orfs_plus_scored, negatives_plus_scored, orfs_min_out_scored, negatives_min_scored, handle_out_orfs, handle_out_negatives)
class AlphabetDetector:
    """Holds the state set up by the constructor: a per-key dict cache
    and the C{no_memory} flag."""

    def __init__(self, no_memory=False):
        """
        Set up an empty detector.

        @param no_memory: stored unchanged on the instance as
        C{self.no_memory}; defaults to C{False}.
        """
        # flag is kept as given; nothing in this constructor interprets it
        self.no_memory = no_memory
        # any key looked up for the first time starts out as an empty dict
        self.alphabet_letters = defaultdict(dict)