def count_retrieved_messages(mailBox):
    import re
    from collections import Counter as counter

    addresses = []
    for mail in mailBox:
        (date, address, subject) = get_message_info(mail)
        matches = re.findall(r'<(.+?)>', address)
        addresses.append("".join(matches))
    counts = counter(addresses)
    return counts, sorted(counts.items(), key=lambda kv: kv[1])
def life(cells):
    """Takes a list of (x, y) points, and returns the next generation of the Game of Life."""
    from collections import Counter as counter

    # Each live cell contributes its whole 3x3 neighbourhood (itself included),
    # so a live cell with 2 or 3 neighbours ends up with a count of 3 or 4.
    neighbours = counter(
        [(x + xo, y + yo) for x, y in cells for xo in [-1, 0, 1] for yo in [-1, 0, 1]]
    )
    return counter(
        [
            cell
            for cell in neighbours
            if neighbours[cell] == 3 or (neighbours[cell] == 4 and cell in cells)
        ]
    )
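# Hedged usage sketch (the input below is illustrative, not from the original snippet):
# a "blinker" of three horizontal cells should flip to a vertical bar in the next generation.
blinker = [(0, 1), (1, 1), (2, 1)]
print(sorted(life(blinker)))  # expected: [(1, 0), (1, 1), (1, 2)]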
def ransom_note(magazine, ransom):
    from collections import Counter as counter

    if len(ransom) > len(magazine):
        return False
    magazine_dict = counter(magazine)
    ransom_dict = counter(ransom)
    for key, value in ransom_dict.items():
        if key not in magazine_dict:
            return False
        elif value > magazine_dict[key]:
            return False
    return True
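# Hedged usage sketch (illustrative inputs, not from the original snippet): the note can be
# built only if the magazine supplies every letter at least as many times as the note needs it.
print(ransom_note("aabbcc", "abc"))  # expected: True
print(ransom_note("aabbcc", "aaa"))  # expected: False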
def minWindowSub(source, target):
    from collections import Counter as counter

    if not source or not target:
        return ""
    s = counter()
    t = counter()
    for i in source:
        s[i] += 1
    for j in target:
        t[j] += 1
    start = 0
    end = 0
    while start < len(source):
        for i in range(len(target)):
            ...  # remainder of the original snippet is missing
def scopes_size(scopes: Scopes) -> Counter:
    """
    Calculate the different scope lengths.

    Parameters
    ----------
    scopes : DefaultDict
        Dictionary of cells (keys) and their scopes.

    Returns
    -------
    Counter
        Counter of scope lengths (keys) and their frequency (values).

    See Also
    --------
    get_scopes

    Examples
    --------
    >>> import numpy as np
    >>> scopes = structure(2, 3)[2]
    >>> scopes_size(scopes) == Counter({2: 4, 3: 4, 4: 1})
    True
    """
    return Counter([len(scope) for scope in scopes.values()])
def calc_mode(list1):
    # Equivalent one-liner: max(set(list1), key=list1.count)
    import collections

    data = collections.Counter(list1)
    datadic = dict(data)
    max_value = max(datadic.values())
    mode = [num for num, freq in datadic.items() if freq == max_value]
    return mode
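# Hedged usage sketch (illustrative input, not from the original snippet): both 2 and 3
# appear twice, so the returned mode list contains both values.
print(calc_mode([1, 2, 2, 3, 3]))  # expected: [2, 3]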
def download_many(cc_list, base_url, verbose, concur_req):
    counter = collections.Counter()
    # Create a ThreadPoolExecutor with max_workers set to concur_req.
    with futures.ThreadPoolExecutor(max_workers=concur_req) as executor:
        # This dict maps each Future instance (representing one download) to the
        # corresponding country code; it is used when handling errors.
        to_do_map = {}
        for cc in sorted(cc_list):
            # Each call to executor.submit schedules a callable for execution and returns a Future.
            future = executor.submit(download_one, cc, base_url, verbose)
            # Store the returned future and the country code in the dict.
            to_do_map[future] = cc
        # futures.as_completed returns an iterator that yields futures as they finish.
        done_iter = futures.as_completed(to_do_map)
        if not verbose:
            # If not in verbose mode, wrap the as_completed result with tqdm to display a progress bar.
            done_iter = tqdm.tqdm(done_iter, total=len(cc_list))
        # Iterate over the futures as they complete.
        for future in done_iter:
            try:
                # Calling result on the future either returns the callable's return value or
                # raises the exception caught while the callable ran; this call may block.
                res = future.result()
            except requests.exceptions.HTTPError as exc:
                error_msg = 'HTTP {res.status_code} - {res.reason}'
                error_msg = error_msg.format(res=exc.response)
            except requests.exceptions.ConnectionError as exc:
                error_msg = 'Connection error'
            else:
                error_msg = ''
                status = res.status
            if error_msg:
                status = HTTPStatus.error
            counter[status] += 1
            if verbose and error_msg:
                # To give the error message some context, look up the country code of the current future.
                cc = to_do_map[future]
                print('*** Error for {}: {}'.format(cc, error_msg))
    return counter
from collections import Counter as counter


def glove(n, arr):
    counts = counter(arr)
    return sum(i // 2 for i in counts.values())


n = input()
arr = list(map(int, input().split()))
print(glove(n, arr))
def predict_labels(self, dists, k=1):
    """
    Given a matrix of distances between test points and training points,
    predict a label for each test point.

    Inputs:
    - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
      gives the distance between the ith test point and the jth training point.

    Returns:
    - y: A numpy array of shape (num_test,) containing predicted labels for the
      test data, where y[i] is the predicted label for the test point X[i].
    """
    num_test = dists.shape[0]
    y_pred = np.zeros(num_test)
    print('start predicting labels')
    for i in range(num_test):
        print(i)
        # A list of length k storing the labels of the k nearest neighbors to
        # the ith test point.
        closest_y = self.y_train[np.argsort(dists[i])[:k]]
        # Naive way: just assign the 0-index of closest_y.
        y_pred[i] = closest_y[0]
        # Better way: count which class appears most often.
        y_pred[i] = counter(closest_y).most_common(1)[0][0]
    return y_pred
def _filter_atomic_property(self, zeta_data, selected_atoms):
    zeta_data['atomic_number'] = zeta_data.index.labels[0] + 1
    zeta_data['ion_number'] = zeta_data.index.labels[1] + 1
    zeta_data = zeta_data[zeta_data.atomic_number.isin(selected_atoms)]
    zeta_data_check = counter(zeta_data.atomic_number.values)
    keys = np.array(list(zeta_data_check.keys()))
    values = np.array(list(zeta_data_check.values()))
    if np.alltrue(keys + 1 == values):
        return zeta_data
    else:
        # raise IncompleteAtomicData('zeta data')
        # This currently replaces missing zeta data with 1, which is necessary with
        # the present atomic data. Will replace with the error above when I have
        # complete atomic data.
        logger.warning('Zeta_data missing - replaced with 1s')
        updated_index = []
        for atom in selected_atoms:
            for ion in range(1, atom + 2):
                updated_index.append([atom, ion])
        updated_index = np.array(updated_index)
        updated_dataframe = pd.DataFrame(
            index=pd.MultiIndex.from_arrays(updated_index.transpose().astype(int)),
            columns=zeta_data.columns)
        for value in range(len(zeta_data)):
            updated_dataframe.ix[zeta_data.atomic_number.values[value]].ix[
                zeta_data.ion_number.values[value]] = \
                zeta_data.ix[zeta_data.atomic_number.values[value]].ix[
                    zeta_data.ion_number.values[value]]
        updated_dataframe = updated_dataframe.astype(float)
        updated_index = pd.DataFrame(updated_index)
        updated_dataframe['atomic_number'] = np.array(updated_index[0])
        updated_dataframe['ion_number'] = np.array(updated_index[1])
        updated_dataframe.fillna(1.0, inplace=True)
        return updated_dataframe
def query(self, point):
    cnt = collections.Counter()
    for rep in self.reprs(point, self.tq):
        cnt[rep] += 1
        for point2 in self.lists[rep]:
            yield point2
    print(f'queried {sum(cnt.values())} buckets. {len(cnt)} unique.')
def _get_intents_and_slots(frame: Node, tree_based: bool) -> IntentsAndSlots:
    intents: Counter[Node] = Counter()
    slots: Counter[Node] = Counter()

    def process_node(node: Node, is_intent: bool) -> None:
        for child in node.children:
            process_node(child, not is_intent)
        if not tree_based:
            node = type(node)(node.label, deepcopy(node.span), text=node.text)
        if is_intent:
            intents[node] += 1
        else:
            slots[node] += 1

    process_node(frame, True)
    return IntentsAndSlots(intents=intents, slots=slots)
def get_expected_messages(stream):
    """Parses a file and gets the expected messages.

    :param stream: File-like input stream.
    :returns: A dict mapping line,msg-symbol tuples to the count on this line.
    """
    messages = collections.Counter()
    for i, line in enumerate(stream):
        match = _EXPECTED_RE.search(line)
        if match is None:
            continue
        line = match.group('line')
        if line is None:
            line = i + 1
        elif line.startswith('+') or line.startswith('-'):
            line = i + 1 + int(line)
        else:
            line = int(line)
        version = match.group('version')
        op = match.group('op')
        if version:
            required = parse_python_version(version)
            if not _OPERATORS[op](sys.version_info, required):
                continue
        for msg_id in match.group('msgs').split(','):
            messages[line, msg_id.strip()] += 1
    return messages
def get_sequence_weights(self):
    """
    Return the calculated sequence weights for all sequences in the MSA.
    The order of weights in the array must be equal to the order of the
    sequences in the MSA.

    :return: Numpy array (dtype=numpy.float64) containing the weights for
        all sequences in the MSA.
    """
    # weights = np.zeros(self.get_size()[0], dtype=np.float64)
    self.r_vals = np.zeros(self.get_size()[1], dtype=np.int64)
    cell_weight = np.zeros_like(self.sequences, dtype=np.float64)
    for i in range(self.get_size()[1]):
        column = self.sequences[:, i]  # equivalent to np.take(self.sequences, i, axis=1)
        counts = counter(column)
        # inefficient way
        """
        s_vals = np.array([counts[val] for val in column])
        self.r_vals[i] = len(counts)
        for j in range(len(column)):
            cell_weight[j][i] = 1/(self.r_vals[i]*counts[self.sequences[j][i]])
        """
        # more efficient way
        s_vals = np.array(list(map(lambda key: counts[key], column)), dtype=np.int64)
        self.r_vals[i] = len(counts)
        if self.r_vals[i] > 1:  # corresponds to S_i,k
            cell_weight[:, i] = np.divide(1, np.multiply(self.r_vals[i], s_vals))
            # Or cell_weight[:, i] = 1/(self.r_vals[i]*s_vals[:])
    weights = cell_weight.sum(1)
    # weights = [sum(x) for x in cell_weight]
    return weights.astype(np.float64)
def arrayToCdf(array):
    import numpy as np
    from collections import Counter as counter

    c = counter(array)
    k = sorted(c.keys())
    total = sum(c.values())
    v = [float(c[key]) for key in k]
    pdf = [x / total for x in v]
    cdf = np.cumsum(pdf)
    return [k, cdf]
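# Hedged usage sketch (illustrative input, not from the original snippet): for [1, 1, 2, 3]
# the empirical CDF reaches 0.5 after value 1, 0.75 after 2, and 1.0 after 3.
keys, cdf = arrayToCdf([1, 1, 2, 3])
print(keys)       # expected: [1, 2, 3]
print(list(cdf))  # expected: [0.5, 0.75, 1.0]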
def _get_received(self):
    messages = self._linter.reporter.messages
    messages.sort(key=lambda m: (m.line, m.symbol, m.msg))
    received_msgs = collections.Counter()
    received_output_lines = []
    for msg in messages:
        received_msgs[msg.line, msg.symbol] += 1
        received_output_lines.append(OutputLine.from_msg(msg))
    return received_msgs, received_output_lines
def _check_same_line_imports(self, node):
    # Detect duplicate imports on the same line.
    names = (name for name, _ in node.names)
    counter = collections.Counter(names)
    for name, count in counter.items():
        if count > 1:
            self.add_message('reimported', node=node,
                             args=(name, node.fromlineno))
def stats_text_en(text):
    import collections
    if not isinstance(text, str):  # Day 8: added a parameter type check
        raise ValueError('The input is not text; please enter it again:')
    # Strip punctuation and whitespace characters
    for token in ['.', '!', '--', '*', ',', '(', ')', ';', ':', "'", '?', '_', '-',
                  '/', '[', ']', '\\', '"', '{', '}', '\t', '\n', '\r\n']:
        text = text.replace(token, '')
    list_text = text.split()  # convert the string into a list of words
    count = int(input("Enter the number of items to output: "))
    dic = collections.Counter(list_text).most_common(count)
    return dic
from collections import Counter as counter


def solve(arr):
    c = counter(arr)
    dicc_sorted = dict()
    print(c)
    for key in c.keys():
        return 0


print(solve([1, 2, 3, 0, 5, 0, 1, 6, 8, 8, 6, 9, 1]))
def stats_text_en(text):  # define the function
    import collections
    if not isinstance(text, str):
        raise ValueError('The argument must be of type str; got %s' % type(text))
    text = text.replace(',', '').replace('.', '').replace('!', '').replace(
        '--', '').replace('*', '').replace('(', '').replace(')', '')
    list_text = text.split()
    count = int(input("Enter the number of items to output: "))
    dic = collections.Counter(list_text).most_common(count)
    return dic
def minor_earned(self) -> Counter[Player]:
    """
    Determine how much money all players indirectly earned in this exercise.

    Returns:
        The amount of money that every player is indirectly responsible for.
    """
    earned = Counter()
    for player, earning in it.product(self.alive, self.earned):
        earned[player] += earning.minor_earned(player)
    return earned
def update_filter(
    self,
    criteria: List[Criterion],
    operation: Operation,
    quantifier: str,
) -> None:
    """Update the selected programs and/or impart the associated taxa and/or mark them as hidden.

    Description:
        - If the operation is `"impart"`:
            - calculate the appropriate sets of programs and taxa;
            - remove these programs from `self.selected_programs`;
            - add these taxa to `self.imparted_knowledge`.
        - If the operation is `"hide"`:
            - calculate the appropriate sets of programs and taxa;
            - add these taxa to `self.hidden_taxa`.
        - Otherwise (the operation is either `"include"` or `"exclude"`):
            - calculate the appropriate bag of programs: this bag counts, for each
              program, the number of criteria it meets (maximum: size of `criteria`);
            - if `quantifier` is `"all"`, remove from this bag all programs which fail
              to meet at least one criterion;
            - include or exclude the resulting programs. Note that the `"exclude"`
              operation extends to the programs which import the resulting ones: if the
              user wants to exclude a program, she obviously expects that the programs
              which require it are excluded too.

    Args:
        criteria (List[Criterion]): A list of criteria, i.e., a mix of regular
            expression patterns (strings) and/or predicates (triples).
        operation (Operation): Either `"impart"`, `"hide"`, `"include"` or `"exclude"`.
        quantifier (str): Either `"any"` or `"all"`.
    """
    if operation in ("impart", "hide"):
        patterns = [str(criterion) for criterion in criteria]
        if operation == "impart":
            (program_set, taxon_set) = self.programs_and_taxa_of_patterns(patterns)
            self.exclude_programs(program_set, follow=False)
            self.impart_taxa(taxon_set)
        else:
            (program_set, taxon_set) = self.programs_or_taxa_of_patterns(patterns)
            self.hidden_programs.update(program_set)
            self.hidden_taxa.update(taxon_set)
    else:
        program_bag = self.programs_of_criteria(criteria, follow=(operation == "exclude"))
        if quantifier == "all":
            # Keep only the programs meeting all criteria: subtracting
            # len(criteria) - 1 from each count zeroes out the others.
            program_bag -= Counter({program: len(criteria) - 1 for program in program_bag})
        if operation == "include":
            self.include_programs(set(program_bag))
        else:  # necessarily "exclude"
            self.exclude_programs(set(program_bag), follow=True)
def satisfiesF(L):
    """
    Assumes L is a list of strings.
    Assume function f is already defined for you and it maps a string to a Boolean.
    Mutates L such that it contains all of the strings, s, originally in L such that
    f(s) returns True, and no other elements. Remaining elements in L should be in
    the same order.
    Returns the length of L after mutation.
    """
    L[:] = [s for s in L if f(s)]
    return len(L)
from collections import Counter as counter


def main():
    n = int(input().rstrip())
    myset = list()
    for _ in range(0, n):
        myset.append(input().rstrip())
    print(len(set(myset)))
    qty = list(counter(myset).values())
    print(*qty, sep=" ")
def _filter_atomic_property(self, ionization_data, selected_atoms):
    ionization_data["atomic_number"] = ionization_data.index.labels[0] + 1
    ionization_data["ion_number"] = ionization_data.index.labels[1] + 1
    ionization_data = ionization_data[ionization_data.atomic_number.isin(selected_atoms)]
    ion_data_check = counter(ionization_data.atomic_number.values)
    keys = np.array(list(ion_data_check.keys()))
    values = np.array(list(ion_data_check.values()))
    if np.alltrue(keys == values):
        return ionization_data
    else:
        raise IncompleteAtomicData(
            "ionization data for the ion ("
            + str(keys[keys != values])
            + str(values[keys != values])
            + ")"
        )
def _filter_atomic_property(self, ionization_data, selected_atoms):
    ionization_data['atomic_number'] = ionization_data.index.labels[0] + 1
    ionization_data['ion_number'] = ionization_data.index.labels[1] + 1
    ionization_data = ionization_data[ionization_data.atomic_number.isin(
        selected_atoms)]
    ion_data_check = counter(ionization_data.atomic_number.values)
    keys = np.array(list(ion_data_check.keys()))
    values = np.array(list(ion_data_check.values()))
    if np.alltrue(keys == values):
        return ionization_data
    else:
        raise IncompleteAtomicData('ionization data for the ion (' +
                                   str(keys[keys != values]) +
                                   str(values[keys != values]) + ')')
def programs_of_criteria(self, criteria: List[Criterion], follow: bool) -> Counter[ProgramName]:
    """Calculate the set of programs that meet at least one of the criteria.

    Description:
        Each criterion may be either:

        - a string, which will be interpreted either as:
            - a program name pattern (ending with `".py"`). All programs matching it
              are accumulated in the result;
            - or a taxon name pattern. All programs featuring at least one taxon
              matching it are accumulated in the result. If the operation is
              `"exclude"`, this set is extended to the programs which import (either
              directly or by transitivity) at least one of its members;
        - a triple consisting of a "subject" pattern, a predicate (positive or
          negative) and an "object" pattern. This predicate is normalized and,
          depending on its "sign", evaluated on the patterns by either
          `ProgramFilter.programs_of_triple` or `ProgramFilter.programs_of_negated_triple`.

    Args:
        criteria (List[Criterion]): A list of criteria, i.e., a mix of regular
            expression patterns (strings) and/or predicates (triples).
        follow (bool): If `True`, extend the result with all the programs which import
            (either directly or by transitivity) at least one program meeting a criterion.

    Returns:
        Counter[ProgramName]: A bag (multiset) counting, for each resulting program,
            the number of criteria it meets.
    """
    resulting_programs: Counter[ProgramName] = Counter()
    for criterion in criteria:
        if isinstance(criterion, str):  # the criterion is a pattern
            if criterion.endswith(".py"):  # the pattern is a program pattern
                programs = self.programs_of_pattern(criterion)
            else:  # the pattern is a label pattern
                taxa = self.taxa_of_pattern(criterion)
                programs = self.programs_of_taxa(taxa, follow=follow)
            resulting_programs.update(programs)
        elif isinstance(criterion, (list, tuple)) and len(criterion) == 3:
            (pattern_1, raw_predicate, pattern_2) = criterion
            (predicate, negated) = normalize_predicate(raw_predicate)
            function = self.programs_of_negated_triple if negated else self.programs_of_triple
            resulting_programs.update(function(pattern_1, predicate, pattern_2))
        else:
            print_warning(f"criterion {repr(criterion)} cannot be included or excluded.")
    return resulting_programs
def spacy_tokenize_content(content, source_lang, setting):
    if source_lang == 'en':
        _spacy = spacy.load('en_core_web_sm')
    elif source_lang == 'de':
        _spacy = spacy.load('de_core_news_sm')
    elif source_lang == 'fr':
        _spacy = spacy.load('fr_core_news_sm')
    elif source_lang == 'es':
        _spacy = spacy.load('es_core_news_sm')
    spacy_content = _spacy(content)
    if setting == "to_words":
        token_set = [token.lemma_ for token in spacy_content if len(token) >= 3]
        token_count = counter(token_set)
    elif setting == "to_sentences":
        token_set = [token.text for token in spacy_content.sents]
        token_count = counter(token_set)
    return token_count
def stats_textt_en(textt_en):
    # Count how many times each English word occurs.
    # Step 1: keep only English letters and split the text into a list.
    # Step 2: strip punctuation such as * and -.
    # Step 3: use collections.Counter to compute the word frequencies.
    import re
    import collections

    result = re.sub("[^A-Za-z]", " ", textt_en.strip())
    newList = result.split()
    newList = [word.strip("*-,.?!") for word in newList]
    newList = [word for word in newList if word]
    print("English word frequency count:", collections.Counter(newList), "\n")
def create_dictionary(clean_list):
    word_count = {}
    for word in clean_list:
        if word in word_count:
            word_count[word] += 1
        else:
            word_count[word] = 1
    for key, value in sorted(word_count.items(), key=operator.itemgetter(1)):
        print("%s : %s" % (key, value))
    c = counter(word_count)
    top = c.most_common(10)
    print(top)
def majorityElement(nums):
    from collections import Counter as counter

    n = len(nums)
    # print(n)
    # x = nums.count(nums)
    myCounter = counter(nums)
    # print(myCounter.items())
    halfN = n // 2
    # print(halfN)
    majorElement = 0
    for key, value in myCounter.items():
        # print(key, '->', value)
        if value > halfN:
            majorElement = key
            # print('majorElement = ', majorElement)
    return majorElement
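# Hedged usage sketch (illustrative input, not from the original snippet): 3 occurs twice
# out of three elements, i.e. more than n // 2 times, so it is the majority element.
print(majorityElement([3, 2, 3]))  # expected: 3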
def parse(textfile, countfile):
    import os
    from collections import Counter as counter

    if os.path.isfile(countfile):
        print("{} exists".format(countfile))
        return
    print("Loading {}".format(textfile))
    with open(textfile, "r") as f:
        data = f.read()
    # Keep letters (lowercased); every other character becomes a space.
    data = map(lambda c: (" ", c.lower())[int(c.isalpha())], data)
    data = "".join(list(data)).split()
    data = counter(data)
    print("Writing {}".format(countfile))
    with open(countfile, "w") as f:
        print("word,count", file=f)
        for word, count in data.items():
            print("{},{}".format(word, count), file=f)
def __fit_clusters(self, column: np.array) -> List[float]:
    """
    Fit the clusters for a given feature.

    Arguments:
        column (np.array): All the values for a single feature.

    Returns:
        The cluster centers for this feature.
    """
    column = np.sort(column)
    distinct_counter = counter(column)
    max_clusters = (
        sum(min(count, self.__min_cluster_size) for count in distinct_counter.values())
        // self.__min_cluster_size
    )
    for num_clusters in range(max_clusters, 0, -1):
        clustering = KMeansConstrained(n_clusters=num_clusters,
                                       size_min=self.__min_cluster_size,
                                       random_state=self.__random_generator)
        clusters = clustering.fit_predict(column[:, np.newaxis])
        if self.__correct_clustering(column, clusters):
            return self.__cluster_centers(column, clusters)
def train_classifier(classifier, cache, classifier_cache=None):
    """
    Train classifier with word2vec feature
    :param classifier: classifier
    :param cache: new cache path
    :param classifier_cache: path of classifier cache
    """
    if not classifier_cache:
        classifier_cache = get_classifier_cache(classifier)
    start_time = datetime.now()
    logger.info('Start training classifier: {}'.format(start_time))
    area_codes, feat = load_cache(cache, __CACHE_KEY_AREA_CODES__, __CACHE_KEY_FEATURE__)
    area_counter = counter(area_codes)
    logger.info('Training with {} samples'.format(len(area_codes)))
    logger.info('Area count: \n{}\n{}'.format(area_counter.keys(), area_counter.values()))
    encode_path = get_encoder_cache(len(area_counter.keys()))
    if os.path.exists(encode_path):
        label_encoder = pickle.load(open(encode_path, 'rb'))
        logger.info('load label_encoder from {}'.format(encode_path))
    else:
        label_encoder = LabelEncoder()
        label_encoder.fit(area_codes)
        pickle.dump(label_encoder, open(encode_path, 'wb'))
        logger.info('dump label encoder to {}'.format(encode_path))
    labels = label_encoder.transform(area_codes)
    classifier.fit(feat, labels)
    joblib.dump(classifier, classifier_cache)
    logger.info('classifier dump to: {}'.format(classifier_cache))
    end_time = datetime.now()
    logger.info('End training classifier: {}'.format(end_time))
    logger.info('Time elapsed: {}s'.format((end_time - start_time).total_seconds()))
for line in f:
    timeslot += 1
    line = line.strip()
    line = line.split(' ')
    if len(line) == 1:
        userId = line[0].split('-')[1]
        usersLocations[userId] = []
        runningLocation = None
    elif (line[1], line[2]) == runningLocation:
        pass
    else:
        runningLocation = (line[1], line[2])
        usersLocations[userId].append(runningLocation)

for k in usersLocations.keys():
    usersLocations[k] = counter(usersLocations[k])

############### Simulation Analysis Over ###############

print(list(usersLocations.keys())[0])
print(list(usersLocations.keys())[1])

user1Locations = []
with open('user1.txt', 'r') as f:
    l = f.readline()
    l = l.strip()
    l = l.split(' ')
    totalDays = 0
    print('User 1:')
    print(l[0])
    user1 = l[0]
from collections import Counter as counter

flexOptions = []
urban = []
urbansize = []
urbrur = []
with open('../../Data/NHTS/Unused_PERV2PUB.CSV', 'r') as f:
    header = f.readline()
    header = header.strip()
    header = header.split(',')
    # print(header.index('FLEXTIME'))  # 40
    print(header.index('URBAN'))
    print(header.index('URBRUR'))
    print(header.index('URBANSIZE'))
    for line in f:
        line = line.strip()
        line = line.split(',')
        flexOptions.append(int(line[40]))
        urban.append(int(line[87]))
        urbansize.append(int(line[88]))
        urbrur.append(int(line[89]))

c = counter(flexOptions)
c1 = counter(urban)
c2 = counter(urbansize)
c3 = counter(urbrur)

urbanFlexOptions = [flexOptions[i] for i in range(len(flexOptions)) if urbrur[i] == 1]
def sets(player):
    from collections import Counter as counter

    values = [int(k[:-1]) for k in player]
    values_counter = sorted(counter(values).items(), key=lambda item: item[1])
    return values_counter
from urllib.request import urlopen
from bs4 import BeautifulSoup
from collections import Counter as counter

site = urlopen("https://www.google.co.uk/finance").read()
soup = BeautifulSoup(site)
text = soup.get_text()
cnt = counter(text)
for train_id in train:  # check it against the info for every training instance
    train_instance = train[train_id]
    train_label = train_instance[0]
    labels.add(train_label)
    train_counts = train_instance[1]
    train_features = set(train_counts.keys())
    shared = test_features.intersection(train_features)
    if similarity == 1:
        # Euclidian distance measure
        nearest.append((train_label, euclidian(train_counts, test_counts, train_features,
                                               test_features, shared, train_id, test_id)))
    else:
        # Cosine distance measure
        nearest.append((train_label, cosine(train_counts, test_counts, shared,
                                            train_id, test_id)))

if similarity == 1:
    # If Euclidian
    nearest = sorted(nearest, key=itemgetter(1))[:int(k_val)]
else:
    # If cosine
    nearest = sorted(nearest, key=itemgetter(1), reverse=True)[:int(k_val)]

projected_label = counter(item[0] for item in nearest).most_common(1)[0][0]
test_confusion_matrix[test_label][projected_label] += 1
sys_output.write("".join(["test:", str(instance_number), "\t", test_label, "\t"]))
instance_number += 1

# print the labels and votes to the sys_output
votes = defaultdict(int)
for tup in nearest:
    votes[tup[0]] += 1
for c in labels:
    sys_output.write("\t" + c + "\t" + str(votes[c]))
sys_output.write("\n")

# print the confusion matrix
print_matrix(test_confusion_matrix)
from decimal import *
import csv
from number import numberOf
from collections import Counter as counter

code = csv.reader(open('zipCodes.csv'), delimiter=',')
bor = csv.reader(open('boroughs.csv'), delimiter=',')
zipIncident = csv.reader(open('Incidents.csv'), delimiter=',')
next(zipIncident)  # skip the header row
cities = [row[1] for row in zipIncident]

freq = []
for (x, y) in counter(cities).items():
    freq.append((x, y))

zip_pop = []
next(code)  # skip the header row
for row in code:
    zip_pop.append((row[1], row[10]))

zip_boroughs = []
next(bor)  # skip the header row
for row in bor:
    zip_boroughs.append((row[0], row[1]))

# incidents = dict(freq)
population = dict(zip_pop)
borough = dict(zip_boroughs)
    user2ObsLocationCount.append([])
    loc = (float(l[4]), float(l[5]))
    user2ObsLocationCount[-1].append(loc)
    user2ObsTripDistances.append(loc)
    runningDay = day
    runningTime = time
    totalDays += numdays
    if totalDays >= 14:
        break

user2ObsLocationCount = [len(set(x)) for x in user2ObsLocationCount]
user2ObsTripDistances = getTripDistances(user2ObsTripDistances)

# Plots for Number of locations visited
c1_count = counter(user1ObsLocationCount)
total = sum(c1_count.values())
for k in c1_count:
    c1_count[k] = float(c1_count[k]) / total

c1_simCount = counter(usersDailyLocationCount[user1])
total = sum(c1_simCount.values())
for k in c1_simCount:
    c1_simCount[k] = float(c1_simCount[k]) / total

c2_count = counter(user2ObsLocationCount)
total = sum(c2_count.values())
for k in c2_count:
    c2_count[k] = float(c2_count[k]) / total

c2_simCount = counter(usersDailyLocationCount[user2])
plt.xlabel('Time of day (hrs)')
plt.ylabel('Duration of activity (hrs)')
plt.title('"Other" activity characteristics indicated by CDR')
plt.savefig('3_Other')
plt.close()

# Categorize beginning timestamps of work in 10 minute windows and plot
divisor = 1.0 / 6
bucketedTimestamps = [None for x in workBeginTimestamps]
for i in range(len(workBeginTimestamps)):
    bucketedTimestamps[i] = workBeginTimestamps[i] - workBeginTimestamps[i] % divisor

c = counter(bucketedTimestamps)
x1 = c.keys()
y1 = c.values()
x = [a for (a, b) in sorted(zip(x1, y1))]
y = [b for (a, b) in sorted(zip(x1, y1))]
sumY = float(sum(y))
pOfY = [yy / sumY for yy in y]

plt.plot(x, pOfY)
plt.xlabel('Start Time [h]')
plt.ylabel('Frequency')
plt.xticks(range(0, 25, 6))
plt.xlim(0, 24)
plt.savefig('StartTimeFrequency')
    stopped_tokens = [i for i in tokens if not i in en_stop]
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    # add tokens to list
    texts.append(stemmed_tokens)

texts_final = []
for i in texts:
    for j in range(len(i) - 1):
        # print(i[j])
        texts_final.append(i[j] + '_' + i[j + 1])
        # texts_final.append(text_temp)

texts_final = counter(texts_final)
dictWords = texts_final.most_common()
texts_final = pd.DataFrame(dictWords)
texts_final.to_csv('WordCount_gent.csv', encoding='utf-8')


# In[23]:


def char_ldamodel(reviewList):
    corpus = create_corpus(reviewList)
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=10,
                                               id2word=dictionary, passes=num_passes)
    return ldamodel
plt.xlabel('Time of day (hrs)')
plt.ylabel('Duration of activity (hrs)')
plt.title('"Work" activity characteristics indicated by MASS')
plt.savefig('../../Figures/TSvsDur_MASS_Cont')
plt.close()

totalCount = 0
for c in counts:
    for x in c:
        totalCount += x

with open('../../ProcessedData/TSvsDur_MASS.txt', 'w') as f:
    for c in counts:
        l = [str(x / totalCount) for x in c]
        l = ' '.join(l)
        f.write(l + '\n')

roundedStartTimes = [round(x) for x in arrivalTimes]
c = counter(roundedStartTimes)
total = len(arrivalTimes)
count24 = c.pop(24)
c[0] += count24

with open('../../ProcessedData/TSvsDurHrPerc_Mass.txt', 'w') as f:
    for k in sorted(c.keys()):
        f.write(str(k) + ' ' + str(100 * float(c[k]) / total) + '\n')
for line in f:
    line = line.strip()
    line = line.split(' ')
    if line[2] == '1':
        numWorkers += 1
    if line[9] == '1':
        numWorking += 1
        startTime = float(line[10])
        endTime = startTime + float(line[11])
        startTimes.append(startTime)
        endTimes.append(endTime)

startTimes = [int(x / 6) for x in startTimes]
endTimes = [int(x / 6) for x in endTimes]
c1 = counter(startTimes)
c2 = counter(endTimes)

x = range(24)
y = [0]
for t in x:
    working = y[-1] + c1[t] - c2[t]
    y.append(float(working))
y = y[1:]
y = [yy / numWorking for yy in y]

plt.plot(x, y)
plt.xlabel('Time of day')
plt.ylabel('Fraction of active workers')
plt.xlim(0, 24)
def isDrift(line):
    from collections import Counter as counter

    minStrains = 2
    # The threshold below is the number of required strains + 1.
    return sorted(counter(line).items(), key=lambda x: x[1], reverse=True)[1][1] <= minStrains
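# Hedged usage sketch (illustrative input, not from the original snippet): the
# second-most-common entry occurs twice, which is within the minStrains threshold.
print(isDrift(["A", "A", "A", "B", "B", "C"]))  # expected: True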
        allWorkersWorkDays.extend(workDays)
        workDays = []
    else:
        continue
    if workerBeingProcessed == 1:
        workerBeingProcessed = 0
        workersProcessed += 1
        worker = workers[workersProcessed]
        workDays = list(set(workDays))
        allWorkersWorkDays.extend(workDays)
        workDays = []

print(workersProcessed)

c = counter(allWorkersWorkDays)
k = [x for (x, y) in sorted(zip(c.keys(), c.values()))]
v = [y for (x, y) in sorted(zip(c.keys(), c.values()))]
d = [x.weekday() for x in k]
for i in range(len(d)):
    if d[i] == 0:
        d[i] = "M"
    elif d[i] == 1:
        d[i] = "T"
    elif d[i] == 2:
        d[i] = "W"
    elif d[i] == 3:
        d[i] = "T"
    elif d[i] == 4:
        d[i] = "F"
def readGenome(filename):
    genome = ''
    with open(filename, 'r') as f:  # opened as read only
        for line in f:
            if not line[0] == '>':
                genome += line.rstrip()
    return genome


genome = readGenome('lambda_virus.fa')

counts = {'A': 0, 'C': 0, 'G': 0, 'T': 0, 'N': 0}
for base in genome:
    counts[base] += 1
print(counts)

import collections
collections.Counter(genome)

!wget --no-check https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/SRR835775_1.first1000.fastq


def readFastq(filename):
    sequences = []
    qualities = []
    with open(filename) as fh:
        while True:
            fh.readline()
            seq = fh.readline().rstrip()
            fh.readline()
            qual = fh.readline().rstrip()
            if len(seq) == 0:
                break
            sequences.append(seq)
            qualities.append(qual)
# initialize and load in data
grammar = defaultdict(list)
train = open(train).read().strip().split("\n")
sentences = open(sentences, 'r').read().strip().split("\n")
for s in train:
    for rule in Tree(s).productions():
        grammar[rule.lhs()].append(rule.rhs())

# create pcfg: the key is the lhs of the rule,
# the value is a dictionary where the key is a tuple of the RHS
# and the value is the prob for that RHS
pcfg = defaultdict(dict)
for left in grammar:
    for k, v in counter(grammar[left]).most_common():
        pcfg[left][k] = v / float(len(grammar[left]))

inversePCFG = defaultdict(list)
for key in pcfg:
    for value in pcfg[key]:
        inversePCFG[value].append((key, pcfg[key][value]))

# output trained grammar
grammar_out = open(grammar_out, 'w')
for left in pcfg:
    for right in pcfg[left]:
        grammar_out.write(" ".join([str(left), "->",
                                    " ".join([str(item) for item in right]),
                                    "[" + str(pcfg[left][right]) + "]", "\n"]))
grammar_out.close()
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets.samples_generator import make_blobs
from sklearn.preprocessing import StandardScaler

data = np.array(X)
db = DBSCAN(eps=0.25, min_samples=80).fit(data)
core_samples = db.core_sample_indices_
labels = db.labels_
numClusters = len(set(labels)) - (1 if -1 in labels else 0)
previousClusterCount = numClusters
clusters = [labels == k for k in range(numClusters)]

from collections import Counter as counter
print(counter(labels))


def getDbscanColor(label):
    if label == '2':
        return 'g'
    elif label == '0':
        return 'b'
    elif label == '1':
        return 'r'
    elif label == '-1':
        return 'k'
    elif label == '3':
        return 'orange'
    elif label == '4':
        return 'magenta'
        m = startTime % 100
        h = startTime // 100
        startTime = h + float(m) / 60
        dwellTime = float(line[89]) / 60
        if dwellTime >= 0 and startTime > 0:  # Considering work longer than 2 hours
            dwellTimesHbo.append(dwellTime)
            startTimesHbo.append(startTime)
f.close()

startTimesNhb = [int(x) for x in startTimesNhb]
startTimesHbo = [int(x) for x in startTimesHbo]
startTimesHbw = [int(x) for x in startTimesHbw]
hourlyCountNhb = counter(startTimesNhb)
hourlyCountHbo = counter(startTimesHbo)
hourlyCountHbw = counter(startTimesHbw)

plt.xlim(0, 24)
x1 = sorted(hourlyCountNhb.keys())
y1 = [hourlyCountNhb[x] for x in sorted(hourlyCountNhb.keys())]
x2 = sorted(hourlyCountHbo.keys())
y2 = [hourlyCountHbo[x] for x in sorted(hourlyCountHbo.keys())]
x3 = sorted(hourlyCountHbw.keys())
y3 = [hourlyCountHbw[x] for x in sorted(hourlyCountHbw.keys())]
plt.plot(x1, y1, color='b', marker='o', label='NHB')
plt.plot(x2, y2, color='r', marker='o', label='HBO')
for t in tractInfo:
    for i in range(len(timeIntervals)):
        if t[1] <= timeIntervals[i][0] and t[2] >= timeIntervals[i][1]:
            occupancy[i] += 1
        if t[0] == "h":
            if t[1] <= timeIntervals[i][0] and t[2] >= timeIntervals[i][1]:
                occupancyHome[i] += 1
        elif t[0] == "w":
            if t[1] <= timeIntervals[i][0] and t[2] >= timeIntervals[i][1]:
                occupancyWork[i] += 1
        elif t[0] == "o":
            if t[1] <= timeIntervals[i][0] and t[2] >= timeIntervals[i][1]:
                occupancyOther[i] += 1

occupantType = [t[0] for t in tractInfo]
c = counter(occupantType)

xCoords = [t[0] for t in timeIntervals]
plt.plot(xCoords, occupancy, color="b", label="Total")
plt.plot(xCoords, occupancyHome, color="r", label="Home")
plt.plot(xCoords, occupancyWork, color="g", label="Work")
plt.plot(xCoords, occupancyOther, color="orange", label="Other")
plt.legend(loc="lower left", prop={"size": 11})
plt.xlabel("Time of day (hours)")
plt.ylabel("Tract Occupancy")
plt.title("Occupancy profile for tract " + TRACT)
plt.savefig(TRACT)
plt.close()
        [c, l] = tractColorAndLabel(tractInfo[tract])  # Info[tract]/maxExpFactor
        x = tractShape[tract][0]
        y = tractShape[tract][1]
        plt.fill(x, y, color=str(c), label=l if l not in addedLabels else '')
        found += 1
        plottedExpFactors.append(tractInfo[tract])
        if l not in addedLabels:
            addedLabels.append(l)
    except:
        notFound += 1

plt.legend(loc='lower left', prop={'size': 11})
plt.savefig('censusTractExpansionFactors', dpi=500)
plt.close()

expansionFactors = tractInfo.values()
# int5ExpansionFactors = [x-x%5 for x in expansionFactors if x<250]
int5ExpansionFactors = [x - x % 5 for x in plottedExpFactors if x < 250 and x > 0]
c = counter(int5ExpansionFactors)
k = c.keys()
v = c.values()
x = [X for (X, Y) in sorted(zip(k, v))]
y = [Y for (X, Y) in sorted(zip(k, v))]
total = sum(y)
y = [float(a) / total for a in y]

plt.plot(x, y, marker='o')
plt.xlabel('Expansion factor, f')
plt.ylabel('P(f)')
plt.savefig('pdfExpansionFactors')
plt.close()