def get_common_words(doc, num):
    """Count cleaned word tokens of a spaCy-style doc.

    Skips stop words, punctuation, and tokens containing garbled/markup
    characters (mojibake, pipes, newlines, non-breaking spaces).

    Params:
        doc: iterable of tokens with .text, .is_stop, .is_punct (spaCy Doc).
        num: how many of the most common words to join for the summary string.
    Returns:
        (Counter of cleaned words, space-joined top-`num` words).
    """
    # '\n' is in this junk list, so the original separate "'\n' not in
    # token.text" check was redundant and is folded in here.
    junk = ('â', '—', '¦', '|', 'ï', '\n', ' ')
    words = [
        cleanup(token.text)
        for token in doc
        if not token.is_stop
        and not token.is_punct
        and not any(c in token.text for c in junk)
    ]
    # BUGFIX(perf): the original built cntr(words) twice; build it once.
    counts = cntr(words)
    return counts, ' '.join(word for word, _ in counts.most_common(num))
def tokenize_me(input_text):
    """Turn HTML text into a set of normalized lemmas plus order-normalized
    bigram strings ("smaller larger" alphabetically).

    Keeps lemmas for nouns/verbs/adjectives and unknown/Latin tokens, as
    produced by the module-level pymorphy `morph` analyzer, minus
    `stop_words`.
    """
    plain = BeautifulSoup(input_text, features="html.parser").get_text()
    raw_tokens = [t for t in nltk.word_tokenize(plain.lower()) if t not in punctuation]

    lemmas = []
    for raw in raw_tokens:
        stripped = raw.strip(string.punctuation)
        if len(stripped) <= 1:
            continue
        parsed = morph.parse(stripped)[0]
        pos = parsed.tag.POS
        if pos is None:
            # No POS: keep only unknown words and Latin-script tokens.
            if str(parsed.tag) in ['UNKN', 'LATN']:
                lemmas.append(parsed.normal_form)
        elif pos in ['NOUN', 'VERB', 'INFN', 'ADJF', 'ADJS']:
            lemmas.append(parsed.normal_form)

    result = set(lemma for lemma in lemmas if lemma not in stop_words)

    # NOTE(review): bigrams are formed over a *set*, so the pairing depends
    # on set iteration order — presumably intentional; confirm with callers.
    # The Counter fully consumes the generator before the loop body runs,
    # so adding to `result` here is safe.
    for first, second in cntr(ngrams(result, 2)):
        if first < second:
            result.add(first + " " + second)
        else:
            result.add(second + " " + first)
    return result
def get_common_nouns(doc, num):
    """Return the `num` most common cleaned NOUN tokens, space-joined.

    Same filtering as get_common_words (no stop words, punctuation, or
    garbled/markup characters), restricted to tokens tagged NOUN.

    Params:
        doc: iterable of tokens with .text, .is_stop, .is_punct, .pos_.
        num: number of top nouns to include.
    Returns:
        str: top nouns joined by single spaces.
    """
    junk = ('â', '—', '¦', '|', 'ï', '\n', ' ')
    nouns = [
        cleanup(token.text)
        for token in doc
        # `not token.is_stop` instead of the `!= True` anti-idiom.
        if not token.is_stop
        and not token.is_punct
        and token.pos_ == "NOUN"
        and not any(c in token.text for c in junk)
    ]
    return ' '.join(word for word, _ in cntr(nouns).most_common(num))
def problem_64():
    """Read a count line (ignored) and a line of ints from stdin, then print
    the minimum number of deletions so that each remaining value k occurs
    exactly k times (values with fewer than k occurrences are removed
    entirely).
    """
    from collections import Counter as cntr
    input()  # element count — Counter sizes itself, so this is unused
    frequency = cntr(map(int, input().split()))
    # Value k seen v times: keep k copies when v >= k (drop v - k),
    # otherwise drop all v copies.
    deletions = sum(v - k if k <= v else v for k, v in frequency.items())
    print(deletions)
def allRepeatedMotifs(dna, minimum, maximum):
    """Find every substring ("motif") of `dna` occurring at least twice.

    Motif lengths are taken from the half-open range [minimum, maximum)
    — e.g. minimum=9, maximum=42 scans 9..41 bp, matching the original
    comments. Case-insensitive: dna is upper-cased first. Timing of each
    phase is printed to stdout.

    Params:
        dna: DNA string (any case).
        minimum: smallest motif length to scan (inclusive).
        maximum: motif length upper bound (exclusive).
    Returns:
        list of [motif, motif_length] pairs, longest motifs first
        (insertion order preserved among equal lengths).
    """
    from collections import Counter

    dna = dna.upper()  # normalize case so 'at' and 'AT' count together

    start = timeit.default_timer()
    substrs = Counter()
    for n in range(minimum, maximum):
        # BUGFIX: was range(0, len(dna)-n), which skipped the final
        # length-n suffix; "+ 1" includes every valid start position.
        for i in range(len(dna) - n + 1):
            substrs[dna[i:i + n]] += 1
    stop = timeit.default_timer()
    print('> Get all repeated substring: ', round(stop - start, 3), "s")

    start = timeit.default_timer()
    # Keep only motifs seen at least twice, sorted by length descending
    # (stable, so counting order is preserved within a length).
    d = dict(
        (k, v)
        for k, v in sorted(substrs.items(), key=lambda item: len(item[0]), reverse=True)
        if v >= 2
    )
    stop = timeit.default_timer()
    print('> Filter and sort motifs with at less two repeats: ', round(stop - start, 3), "s")

    start = timeit.default_timer()
    # Reformat to [motif, length] pairs (keys are str, so len(key) suffices).
    ls = [[key, len(key)] for key in d]
    stop = timeit.default_timer()
    print('> Reformating dictionary to list, include length of motifs (bp): ', round(stop - start, 3), "s")
    print("> At all there are: ", len(ls), " motifs, with at less two repeats.")
    return ls
def calculateBasins(grid):
    """Return the sizes of all drainage basins of an n x n grid, largest first.

    Each cell is unioned with the neighbour returned by the module-level
    get_min(grid, i, j, n); cells sharing a final root form one basin.

    Params:
        grid: square 2-d sequence of heights (n x n).
    Returns:
        list of basin sizes (ints) sorted in descending order.
    """
    n = len(grid)
    # Parent array over flattened cell ids; basins[u] == u means u is a root.
    basins = list(range(n * n))

    def find(u):
        # BUGFIX(robustness): iterative find with path compression — the
        # original recursive version could hit RecursionError on long
        # parent chains over n*n cells; compression also flattens the tree.
        root = u
        while basins[root] != root:
            root = basins[root]
        while basins[u] != root:
            basins[u], u = root, basins[u]
        return root

    def union(u, v):
        basins[find(v)] = find(u)

    # Union every cell with its lowest neighbour (flow direction).
    for i in range(n):
        for j in range(n):
            mini, minj = get_min(grid, i, j, n)
            union(i * n + j, mini * n + minj)

    # Basin size = number of cells sharing a root.
    return sorted(cntr(find(u) for u in range(n * n)).values(), reverse=True)
def split_on_best_feature(self, dataX, dataY):
    """Recursively build a decision-tree ndarray, splitting each node on the
    feature with the highest absolute correlation with dataY.

    Rules (from the original spec):
      1. The split value is the median of the chosen feature's column.
      1a. Ties in correlation keep the earlier feature (stable sort).
      2. If the best feature cannot separate the rows, the next best is
         tried; if none can (or the node has <= leaf_size rows), a leaf
         holding the modal dataY value is returned.

    Params:
        dataX: numpy ndarray of feature rows at this node.
        dataY: numpy 1-d array of target values at this node.
    Returns:
        numpy array; each row is
        [feature_index (-1 for a leaf), split_value, left_offset, right_offset].
    """
    from collections import Counter
    from scipy.stats import pearsonr

    def _leaf():
        # Leaf row: feature index -1, value = most common target.
        return np.array([-1, Counter(dataY).most_common(1)[0][0], np.nan, np.nan])

    if dataX.shape[0] <= self.leaf_size:
        return _leaf()

    num_features = dataX.shape[1]
    if num_features == 0:
        return _leaf()

    # |correlation| of each feature column with dataY; NaN (e.g. constant
    # column or constant dataY) is treated as 0.0.
    feature_correlations = []
    for ftr in range(num_features):
        corr = abs(pearsonr(dataX[:, ftr], dataY)[0])
        feature_correlations.append((ftr, 0.0 if np.isnan(corr) else corr))
    # BUGFIX: the original called sorted() on this list while it was still
    # EMPTY and then appended, so features were actually tried in index
    # order, not by correlation. Sort after populating, best first; the
    # stable sort keeps earlier features ahead on ties (rule 1a).
    feature_correlations.sort(key=lambda fc: fc[1], reverse=True)

    # Walk features best-to-worst until one separates the rows.
    remaining = list(range(num_features))
    rank = 0
    while remaining:
        best_feature_itr = feature_correlations[rank][0]
        split_val = np.median(dataX[:, best_feature_itr])
        left_index = dataX[:, best_feature_itr] <= split_val
        right_index = dataX[:, best_feature_itr] > split_val
        if len(np.unique(left_index)) != 1:
            break  # both sides non-empty: this feature splits the node
        remaining.remove(best_feature_itr)
        rank += 1

    # No feature could split the node at all: return a leaf.
    if not remaining:
        return _leaf()

    lefttree = self.split_on_best_feature(dataX[left_index], dataY[left_index])
    # Right subtree begins just past the left subtree (offsets are relative
    # to this root row; a 1-d lefttree is a single leaf row).
    righttree_start = 2 if lefttree.ndim == 1 else lefttree.shape[0] + 1
    root = np.array([best_feature_itr, split_val, 1, righttree_start])
    return np.vstack((root, lefttree,
                      self.split_on_best_feature(dataX[right_index], dataY[right_index])))
# Report gate-job logs whose job id has no matching output directory, echo
# their error lines, and summarize how many jobs to relaunch per run dir.
# Relies on earlier script state: gatelog_ids, outdir_ids, gatelogs,
# outdirs, and the error regex `w` — TODO confirm these are set upstream.
errlist = list(gatelog_ids - outdir_ids)
gatelogs_werr = []
for x in errlist:
    # NOTE(review): '**' is only recursive with glob(..., recursive=True);
    # here it matches a single path component like '*' — presumably the
    # intended one-level layout (see the run-dir check below), so left as is.
    gatelogs_werr.extend(glob.glob("./**/gatejob*" + str(x)))
print("--------------Joblogs with errors----------------")
print("joblogs without matching outputdir:")
for logf in gatelogs_werr:
    print(logf)
    with open(logf) as f:
        preverr = ""
        for line in f:
            # Echo lines matching the error pattern, skipping consecutive
            # duplicates of the same line.
            if re.search(w, line, re.IGNORECASE) and preverr != line:
                preverr = line
                print(line, end=' ')
print("-------------------------------------------------")
run_ids = []
# BUGFIX: guard against an empty match list — the original indexed
# gatelogs_werr[0] unconditionally and raised IndexError when nothing matched.
if gatelogs_werr and len(gatelogs_werr[0].split('/')) > 2:
    # Logs live in per-run subdirectories: tally relaunches per run dir.
    for logf in gatelogs_werr:
        run_ids.append(logf.split('/')[-2])
    for key, val in cntr(run_ids).items():
        print("Relaunch", val, "jobs in", key)
else:
    print("Relaunch", len(gatelogs) - len(outdirs), "jobs.")
# --- Python 2 variant of the joblog error report (a Python 3 port of the
# same logic appears earlier in this file). Uses print statements, so it
# only runs under Python 2. ---
# Relies on earlier script state: outdirs, gatelogs, gatelog_ids, and the
# error regex `w` — TODO confirm these are set upstream.
# Job id = integer suffix after the last '.' of each output-dir name.
outdir_ids = set([int(x.split('/')[-1].split('.')[-1]) for x in outdirs])
# Job ids that produced a gate log but no matching output directory.
errlist = list(gatelog_ids - outdir_ids)
gatelogs_werr = []
for x in errlist:
    #print x
    # Collect log files for each failed job id. NOTE(review): on Python 2,
    # glob's '**' matches a single path component, like '*'.
    gatelogs_werr.extend(glob.glob("./**/gatejob*" + str(x)))
print "--------------Joblogs with errors----------------"
print "joblogs without matching outputdir:"
for logf in gatelogs_werr:
    print logf
    with open(logf) as f:
        preverr = ""
        for line in f:
            # Echo lines matching the error pattern, skipping consecutive
            # duplicates of the same line.
            if re.search(w, line, re.IGNORECASE) and preverr != line:
                preverr = line
                print line,
print "-------------------------------------------------"
run_ids = []
# NOTE(review): gatelogs_werr[0] raises IndexError when nothing matched —
# consider guarding with `if gatelogs_werr and ...` (the Py3 port has the
# same issue).
if len(gatelogs_werr[0].split('/')) > 2:
    # Logs live in per-run subdirectories: tally relaunches per run dir.
    for logf in gatelogs_werr:
        run_ids.append(logf.split('/')[-2])
    for key, val in cntr(run_ids).items():
        print "Relaunch", val, "jobs in", key
else:
    print "Relaunch", len(gatelogs) - len(outdirs), "jobs."