def main(args):
    df = pd.read_csv(args.metadatafile)
    if not os.path.exists(args.outputdir):
        os.makedirs(args.outputdir)
    # drop rows that describe metadata sheets only (no Code)
    df.dropna(subset=["Code"], inplace=True)
    # remove the rows where Conditions are undefined
    undef = "undefined"
    df = df.query("Conditions != @undef")
    # make a dictionary to store, for each code, the list of
    # filenames for each replicate
    codemap = dd(lambda: dd(list))
    for row_index, row in df.iterrows():
        filename = row["Filename"]
        code = row["Code"]
        # extract the replicate and then remove to get code only
        pattern = re.compile(r"-(\d+)")
        m = re.search(pattern, code)
        replicate = m.group(1)
        codeonly = re.sub(pattern, "", code)
        # print filename,code,codeonly,replicate
        codemap[codeonly][replicate].append(filename)
    for code, replist in codemap.items():
        concat(code, replist)
def __init__(self):
    self.data = []
    self.tag = []
    self.ngram = 4
    self.types = dd(int)
    self.frameType = 'fre'
    self.frames = dd(lambda: dd(int))
    self.threshold = 45  # remove frames with counts below the threshold (default: keep top 45)
    self.tokenCount = 0
    self.utteranceCount = 0
    self.debug = 1
    self.filterFrameCount = 0
    self.filterUtterBound = False
    self.removeUtterBound = False
    self.filterTokenTags = False
    self.quiet = True
    # self.tagMap={'n':'n','pro':'n','adj':'adj', 'adv':'adv','conj':'conj',\
    #              'det':'det', 'qn':'det', 'prep':'prep', 'v':'v', 'aux':'v',\
    #              'part':'v', 'mod':'v', 'neg':'neg', \
    #              'co':'int', 'int':'int', 'wh':'wh'}
    self.tagMap = {'n': 'n', 'pro': 'n', 'adj': 'adj', 'adv': 'adv', 'conj': 'conj',
                   'det': 'det', 'prep': 'prep', 'v': 'v', 'part': 'v', 'mod': 'v',
                   'aux': 'v', 'int': 'int', 'neg': 'neg', 'wh': 'wh'}
    #### regs ###
    self.regUtterBound = re.compile('^P_')
    self.regWordTag = re.compile('^(.*?)\/(.*?):')
    self.regWhTag = re.compile('^.*?\/.*?:wh:')
    self.regNoTag = re.compile('^(.*?)\/_?$')
def takeQueries(self):
    while 1:
        word = raw_input("Enter a word:")
        if word == 'exit':
            break
        print "CK func return value:", self.isCK(word)
        print "xSub func return value:", self.xSub(word, dd(set), dd(set), 0)
def compute_gradient(self, generative_model):
    gradient = dd(float)
    recprob = np.exp(self.partition) / np.sum(np.exp(self.partition))
    jointprob = np.copy(recprob)
    for latent_states in itertools.product(self.latent_states, repeat=4):
        for i in range(4):
            jointprob[latent_states] *= generative_model.px_given_z(self.sentence[i], latent_states[i])
    jointprob /= np.sum(jointprob)
    # Calculate feature expectation over q(z|x)
    feat_exp = dd(float)
    for latent_states in itertools.product(self.latent_states, repeat=4):
        for fcn in self.feature_functions()[1]:
            for i in range(4):
                feat_exp[fcn(i, self.sentence, latent_states[i])] += recprob[latent_states]
        for fcn in self.feature_functions()[3]:
            feat_exp[fcn(0, self.sentence, *latent_states[0:3])] += recprob[latent_states]
            feat_exp[fcn(1, self.sentence, *latent_states[1:4])] += recprob[latent_states]
    # Calculate the gradient
    for latent_states in itertools.product(self.latent_states, repeat=4):
        # print latent_states, jointprob[latent_states]
        temp = dd(float)
        for fcn in self.feature_functions()[1]:
            for i in range(4):
                temp[fcn(i, self.sentence, latent_states[i])] += 1
        for fcn in self.feature_functions()[3]:
            temp[fcn(0, self.sentence, *latent_states[0:3])] += 1
            temp[fcn(1, self.sentence, *latent_states[1:4])] += 1
        for feature in feat_exp:
            gradient[feature] += jointprob[latent_states] * (temp[feature] - feat_exp[feature])
    return gradient
def __init__(self, text):
    super(Word, self).__init__()
    self.text = text
    self.freq = 0.0
    self.left = dd(int)
    self.right = dd(int)
    self.aggreg = 0.0
def __init__(self, area):
    self.initiallat = area[3][0]
    self.initiallon = area[3][1]
    self.dist_range = 5.0     # nm
    self.alt_range = 1000.0   # ft
    self.t_cpa = 0
    self.dist_cpa = 0
    self.spd = np.array([])
    self.lat = np.array([])
    self.lon = np.array([])
    self.pos = np.array([])
    self.trk = 0
    self.alt_dif = 0
    self.alt = 0
    self.id = []
    self.complexity = dd(lambda: dd(int))
    self.rel_trk = np.array([])
    self.step = -1
    self.id_previous = []
    self.headings = []
    self.headings_previous = np.array([])
    self.doubleconflict = 0
    self.ntraf = 0
    self.compl_ac = 0
    self.time_lookahead = 1800  # seconds
    self.selected_area = ([area[0][0], area[0][1]], [area[1][0], area[1][1]],
                          [area[2][0], area[2][1]], [area[3][0], area[3][1]])
def __init__(self):
    self.posts = []
    self.userwiseThreads = dd(set)
    self.userwisePosts = dd(set)  # Stores indices
    self.userStart = dd(lambda: 5000)
    self.maxDay = -1
    self.userWeekwisePosts = dd(lambda: dd(list))
    self.userWeekwiseAccusations = dd(lambda: dd(set))
    self.fakeUsers = set()
    # Stores the postId of the previous fake annotation we did
    self.fakeUsersPosts = {}
    self.postIdMap = {}
    self.nonFakeUsers = set()
    self.twitterLexicon = set()
    self.pkWords = set(["napkin", "pumpkin", "pk", "upkeep"])
    self.kcWords = set(["kc", "backcast", "backcloth", "blackcock", "blackcurrant",
                        "bookcase", "cockchafer", "dickcissel", "kekchi", "kinkcough",
                        "lockchester", "markcourt", "neckcloth", "packcloth", "sackcloth"])
    self.bkWords = set(["bk", "abk", "ebk", "bks", "abks", "ebks", "bkz", "abkz", "ebkz"])
    self.addPlurals(self.pkWords)
    self.pkWords.add("pk's")
    self.ccWords = set()
    self.ckWords = set()
    self.loadLexiconForCC()
    self.loadLexiconForCK()
    self.loadTwitterLexicon("")
    self.addAAESuffEnds(self.twitterLexicon)
    self.addXtreme(self.twitterLexicon)
    self.addPlurals(self.twitterLexicon)
    self.features = set(["cc", "ck", "bk", "pk", "hk", "oe", "3", "5", "6", "x",
                         'nword', 'hood', 'bCaret', 'cCaret', 'pCaret', 'hCaret'])
    self.activeForums = {}
    self.wordsNotConsideredLater = dd(int)
    self.wordsConsidered = dd(lambda: dd(int))
    self.consideredWordsCount = 0
def load_key(fname):
    print >> sys.stderr, "loading %s" % fname
    d = dd(lambda: dd(lambda: dd(lambda: 0.)))
    lines = open(fname).readlines()
    #c = 0
    for line in lines:
        line = line.split()
        key, inst = line[:2]
        senses = line[2:]
        senses = [sense.split('/') for sense in senses]
        if len(senses) == 1:
            #c += 1
            d[key][inst][senses[0][0]] = 1.
        else:
            uni = []
            for sense in senses:
                if len(sense) == 1:
                    uni.append(sense)
                else:
                    d[key][inst][sense[0]] = float(sense[1])
            if len(uni) > 0:
                # if any sense is unweighted, they must all be unweighted
                assert len(uni) == len(senses), "Some sense weighted, some not: %s" % inst
                val = 1. / len(uni)
                for sense in senses:
                    d[key][inst][sense[0]] = val
    return d
def solve():
    global A, motes, states
    motes.sort()
    states = [dd(int), dd(int)]
    old, new = 0, 1
    # state[idx, ops] = size
    states[new][0] = A
    shortcuts = set()
    for idx in range(0, len(motes)):
        new, old = old, new
        states[new] = dd(int)
        for ops in states[old]:
            size = states[old][ops]
            if size > motes[idx]:
                states[new][ops] = max(states[new][ops], size + motes[idx])
            elif size <= motes[idx]:
                # eat new particles
                if size != 1:
                    tmp_size = size
                    tmp_ops = 0
                    while tmp_size <= motes[idx]:
                        tmp_size += tmp_size - 1
                        tmp_ops += 1
                    states[new][ops + tmp_ops] = max(states[new][ops + tmp_ops], tmp_size + motes[idx])
                # eliminate next particle
                # states[new][ops+1] = max(states[new][ops+1], size)
                shortcuts.add(ops + len(motes) - idx)
    if len(shortcuts) and len(states[new]):
        return min(min(states[new].keys()), min(shortcuts))
    return min(shortcuts) if len(shortcuts) else min(states[new].keys())
def fetch_pos():
    pos_id = dd(lambda: dd())
    for r in query_omw("""SELECT id, tag, def FROM pos"""):
        pos_id['id'][r['id']] = r['tag']
        pos_id['tag'][r['tag']] = r['id']
        pos_id['def'][r['id']] = r['def']
    return pos_id
def gen_label_graph(self):
    labels, label2inst, not_label = [], dd(list), dd(list)
    for i in range(self.x.shape[0]):
        flag = False
        for j in range(self.y.shape[1]):
            if self.y[i, j] == 1 and not flag:
                labels.append(j)
                label2inst[j].append(i)
                flag = True
            elif self.y[i, j] == 0:
                not_label[j].append(i)
    while True:
        g, gy = [], []
        for _ in range(self.g_sample_size):
            x1 = random.randint(0, self.x.shape[0] - 1)
            label = labels[x1]
            if len(label2inst) == 1:
                continue
            x2 = random.choice(label2inst[label])
            g.append([x1, x2])
            gy.append(1.0)
            for _ in range(self.neg_samp):
                g.append([x1, random.choice(not_label[label])])
                gy.append(-1.0)
        yield np.array(g, dtype=np.int32), np.array(gy, dtype=np.float32)
def get_file_dict(fns, header_prefix=''):
    file_map = dd(lambda: dd(list))
    out_fieldnames = []
    blank_entry = []
    for fn in fns:
        max_maps = 0
        f = reader(open(fn), delimiter='\t')
        #f = open(fn)
        fieldnames = f.next()
        fieldnames = fieldnames[2:]  # we don't want existing knownGeneID or geneSymbol
        # read in the data, create a dictionary
        for l in f:
            if opts.symbols:
                gene, symbol, data = l[0], l[1], l[2:]
                symbol_map[gene] = symbol
            else:
                gene, data = l.split('\t', 1)
            file_map[fn][gene].append(data)
            max_maps = max(max_maps, len(file_map[fn][gene]))
            all_genes.add(gene)
        # if we're adding a binary column, do it
        if opts.binary_plus:
            out_fieldnames.append(header_prefix + fn + '.MAPPED')
        # construct the fieldnames for this file
        for i in range(max_maps):
            out_fieldnames.extend(['%s%s.%d.%s' % (header_prefix, fn, i, h) for h in fieldnames])
        # pad out data entries w/ fewer than max_maps
        for gene, data in file_map[fn].items():
            while len(data) < max_maps:
                data.append([''] * len(fieldnames))
        file_map[fn]['blank'] = [[''] * len(fieldnames) for _ in range(max_maps)]
    return file_map, out_fieldnames
def gridIllumination(self, N: int, lamps: List[List[int]], queries: List[List[int]]) -> List[int]:
    result = []
    row = dd(int)
    col = dd(int)
    ru_ld = dd(int)
    lu_rd = dd(int)
    lamp_set = set()
    for lamp in lamps:
        x, y = lamp
        row[x] += 1
        col[y] += 1
        ru_ld[x - y] += 1
        lu_rd[x + y] += 1
        lamp_set.add((x, y))
    dx = [-1, -1, -1, 0, 0, 0, 1, 1, 1]
    dy = [-1, 0, 1, -1, 0, 1, -1, 0, 1]
    for query in queries:
        x, y = query
        if row[x] > 0 or col[y] > 0 or ru_ld[x - y] > 0 or lu_rd[x + y] > 0:
            result.append(1)
        else:
            result.append(0)
        # turn off the queried lamp and its eight neighbours
        for p, q in zip(dx, dy):
            nx = p + x
            ny = q + y
            if 0 <= nx < N and 0 <= ny < N and (nx, ny) in lamp_set:
                row[nx] -= 1
                col[ny] -= 1
                ru_ld[nx - ny] -= 1
                lu_rd[nx + ny] -= 1  # the anti-diagonal is indexed by x + y
                lamp_set.remove((nx, ny))
    return result
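A quick sanity check for the method above. The surrounding Solution class is assumed (LeetCode style), and the expected output follows the standard example for this problem:

# With lamps at (0, 0) and (4, 4) on a 5x5 grid, query (1, 1) is lit by the
# (0, 0) diagonal; switching off that neighbourhood removes the (0, 0) lamp,
# so the second query (1, 0) is dark.
sol = Solution()
print(sol.gridIllumination(5, [[0, 0], [4, 4]], [[1, 1], [1, 0]]))  # -> [1, 0]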
def predict_by_chunks(self):
    predictions = dd(dict)
    # optimization
    self.optimize()
    self.logger.info(self.clf_wrapper.classifier)
    for tw, chunks in self.dataset.viewitems():
        tw_predictions = dd(list)
        #tw_scores = []
        for i, datasets in enumerate(chunks):
            # test set and train set
            tr, te = datasets
            X_train, X_test, y_train, y_test, test_inst_order = self._prepare(tw, tr, te)
            #score = 0.0
            try:
                self.clf_wrapper.classifier.fit(X_train, y_train)
                prediction = self.clf_wrapper.classifier.predict(X_test)
                #score = self.clf_wrapper.classifier.score(x_test, y_test)
            except ValueError, e:
                # all instances belong to the same class
                self.logger.warning("{}-{}: {}".format(self.clf_wrapper.name, tr, e))
                if str(e) == "The number of classes has to be greater than one.":
                    print >> sys.stderr, "initialization", y_train[0]
                    prediction = [y_train[0]] * len(y_test)
                    #score = sum(prediction == y_test) / float(len(y_test))
                if str(e) == "Input X must be non-negative.":
                    pass
            tw_predictions[i].extend(zip(test_inst_order, prediction))
        for i in xrange(len(chunks)):
            predictions[i][tw] = dict(tw_predictions[i])
    return predictions  # assumed: the populated dict is the natural return value
def initialize_data(self, elicited_features):
    # sets term_indices (per language), languages (the set of languages), nS
    # (number of situations), as well as CMs (count matrices for every
    # language; { language : nS x nT_language })
    #fn = 'data/%s/elicited_features.csv' % self.data_folder
    #with open(fn, 'r') as fh:
    #    self.elicited_features = list(csv.reader(fh))
    self.elicited_features = elicited_features
    #self.filter_data()  # YM: Filters the data to eliminate any situations occurring only in one of the target languages.
    self.term_indices = dd(lambda: {})
    self.languages = set()
    self.nS = 0
    CM_constructor = dd(lambda: dd(lambda: dd(float)))
    for language, subject, situation, word in self.elicited_features:
        self.languages.add(language)
        self.nS = np.max([self.nS, int(situation) + 1])
        try:
            word_ix = self.term_indices[language][word]
        except KeyError:
            lt = len(self.term_indices[language])
            word_ix = self.term_indices[language][word] = lt
        CM_constructor[language][int(situation)][word_ix] += 1.0
    self.CMs = {language: np.zeros((self.nS, len(self.term_indices[language])))
                for language in self.languages}
    for language, v1 in CM_constructor.items():
        for situation, v2 in v1.items():
            for term, count in v2.items():
                self.CMs[language][situation, term] = count
def __init__(self, *args, **kwargs):
    """Constructor for the corpus object.

    Takes a list of documents, and constructs the corpus.
    """
    self.__docids = set()
    self.__entities = dd(int)
    self.__entity_index = dd(list)
def __init__(self, files, keys, ktype, prefix):  # ,strictLevel):
    self.METHOD = "UNIQUE_SENSE"
    if ktype == 'binary':
        self.groupDict = {}
        self.flist = []
        for line in open(keys):
            line = line.split()
            if line[1] == '0' or line[1] == '1':
                self.groupDict[line[0]] = int(line[1])
                self.flist.append(line[0])
                self.keyType = 'binary'
            else:
                sys.stderr.write("Warning: Non-binary key supplied - assuming continuous variable\n")
                self.keyType = 'continuous'
                self.groupDict[line[0]] = float(line[1])
                self.flist.append(line[0])
    self.cnt_dict = dd(lambda: dd(lambda: [0, 0, 0, 0]))
    self.total_cnts = dd(float)
    for f in files:
        if f not in self.flist:
            print "ERROR: %s is missing from the key file" % f
            sys.exit()
        for line in open(f):
            line = line.split()
            self.cnt_dict[line[0]][f] = [float(line[1]), float(line[2]), float(line[3]), float(line[4])]
            self.total_cnts[f] = sum(self.cnt_dict[line[0]][f])
    self.prefix = prefix
def __init__(self):
    self._userWise = dd(list)
    self._tweets = []
    self._scores = []
    self._correct = []
    self._mean = dd(float)
    self._variance = dd(float)
def __init__(self):
    sys.stderr.write("Tagger: In Constructor\n")
    ## Data containers
    self.__classes = []
    self.__train = []
    self.__test = []
    self.__output = []
    self.__accuracies = []
    ## Tagger Options and Settings
    self.__workDir = "/tmp"
    self.__tntTrain = "tnt-para"
    self.__trainOptions = ""
    self.__testOptions = "-v3 -m"
    self.__tntTest = "tnt"
    self.__trainFile = self.__workDir + "/" + "train"
    self.__testFile = self.__workDir + "/" + "test"
    self.__modelFile = self.__workDir + "/" + "model"
    ## Accuracy Stuff
    self.__tags = dd(lambda: dd(int))
    self.__sameLangContext = dd(lambda: dd(int))
    self.__diffLangContext = dd(lambda: dd(int))
    self.__prevWordDiffContext = dd(lambda: dd(int))
    self.__preprevWordDiffContext = dd(lambda: dd(int))
    self.__unknownWords = 0
    self.__totalCorrect = 0
    self.__totalWords = 0
    self.__totalSents = 0
    self.__correctSents = 0
def main():
    parser = ap.ArgumentParser()
    parser.add_argument('eids')
    args = parser.parse_args()
    improvements = dd(lambda: dd(list))
    # loop variable renamed to run_args so it no longer shadows the argparse namespace
    for fn, run_args, duration, result in rtk.dist.db.iter_results(args.eids):
        for r in result:
            for k, v in r.items():
                improvements[k][run_args['rand_seed']].append(v)
    for k in improvements.keys():
        all_results = np.vstack(improvements[k].values())
        assert all_results.shape == (len(improvements[k]), len(improvements[k].values()[0]))
        improvements[k] = (all_results.mean(axis=0), all_results.std(axis=0))
    x = range(1, improvements.iteritems().next()[1][0].shape[0] + 1)
    for k, (mean, err) in improvements.iteritems():
        if 'vae' in k:
            continue
        plt.errorbar(x, mean, yerr=err, label=k)
    plt.axvline(x=5, color='purple')
    plt.axhline(y=0.25, color='green')
    plt.legend(loc='center right')
    plt.title('Performance Metrics for CRF Semi-supervised')
    plt.xlabel('Iteration #')
    plt.savefig('crf.pdf')
def compilePairs(self):
    tmpPairs = []
    self.endTable = dd(list)
    self.startTable = dd(list)
    for p in self.knownJxns:
        tmpPairs += [(p[i - 1][1], p[i][0]) for i in range(1, len(p))]
    self.validPairs = sorted([t for t in set(tmpPairs)])
    for t in self.validPairs:
        self.endTable[t[0]].append(t[1])
        self.startTable[t[1]].append(t[0])
def get_mapped_senses(gold_instance_dict, sys_instance_dict, train_test_splits):
    mapped_senses = dict()
    for train, test in train_test_splits:
        gold_train_senses = get_max_sense(gold_instance_dict, train)
        sys_train_senses = get_max_sense(sys_instance_dict, train)
        d = dd(lambda: dd(int))
        # build the sense mapping matrix. Each system-gold occurrence equals
        # 1 point.
        for g_sense, s_sense in izip(gold_train_senses, sys_train_senses):
            d[s_sense][g_sense] += 1
        # Make majority vote among gold senses for each system sense occurred
        # together with the same instances.
        mapping = dict([(s_sense, max(d[s_sense].items(), key=lambda e: e[1])[0])
                        for s_sense in d])
        LOGGER.debug("Mapping: %s", mapping)
        # Map test senses. If mapping doesn't contain any sense mapping for
        # particular system sense; skip it.
        sys_test_senses = get_max_sense(sys_instance_dict, test)
        mapped_test_chunk = dict([(instance, mapping[s_sense])
                                  for instance, s_sense in izip(test, sys_test_senses)
                                  if s_sense in mapping])
        mapped_senses.update(mapped_test_chunk)
    return mapped_senses
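A small worked example of the majority-vote step above: each system sense is mapped to the gold sense it co-occurred with most often in the training fold.

# Sketch of the vote matrix built in the loop above (illustrative values):
# d = {'sys_1': {'gold_A': 3, 'gold_B': 1}, 'sys_2': {'gold_B': 2}}
# max(d['sys_1'].items(), key=lambda e: e[1]) -> ('gold_A', 3)
# so mapping == {'sys_1': 'gold_A', 'sys_2': 'gold_B'}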
def __init__(self, dataFile):
    self.__data = []
    self.__commWiseIndices = {}
    self.__commWiseSampleIndices = {}
    self.__commWiseSampleWordFreq = dd(lambda: dd(int))
    self.__read(dataFile)
    self._tok = Tokenizer(preserve_case=False)
def accuracy(test, output):
    outputLines = [cmp(float(l.strip()), 0) for l in open(output)]
    testLines = [l.strip() for l in open(test)]
    classDict = {1: 'black', -1: 'jewish'}
    A = 0
    Correct = dd(int)
    Actual = dd(int)
    Given = dd(int)
    for index in range(len(outputLines)):
        testClass = int(testLines[index].split()[0])
        outputClass = outputLines[index]
        if testClass == outputClass:
            A += 1
            Correct[classDict[testClass]] += 1
        Given[classDict[outputClass]] += 1
        Actual[classDict[testClass]] += 1
    A = str(round(A * 100.0 / len(testLines), 2))
    PBlack = str(round(Correct['black'] * 100.0 / Given['black'], 2))
    RBlack = str(round(Correct['black'] * 100.0 / Actual['black'], 2))
    PJew = str(round(Correct['jewish'] * 100.0 / Given['jewish'], 2))
    RJew = str(round(Correct['jewish'] * 100.0 / Actual['jewish'], 2))
    #content = open(output).read()
    #A = content.split('Accuracy on test set:')[1].split('(')[0].strip()
    #P = content.split('Precision/recall on test set:')[1].split('/')[0].strip()
    #C = content.split('Precision/recall on test set:')[1].split('/')[1].strip()
    return A, PBlack, RBlack, PJew, RJew
def preparePostsSingleDoc(self, outputFile):
    outputFile = open(outputFile, 'w')
    backGroundVector = dd(int)
    for key in self.__commWiseSampleIndices.iterkeys():
        for index in self.__commWiseSampleIndices[key]:
            tokens = self._tokenize(self.__data[index][1])
            freqVector = self.freqVector(tokens)
            for token, freq in freqVector.iteritems():
                backGroundVector[token] += freq
    print "Background words:", len(backGroundVector)
    filteredLexicon = self.__filterWords(backGroundVector)
    print "Filtered Words:", len(filteredLexicon)
    ##self.analyzeLexicon(filteredLexicon, backGroundVector)
    ##sys.exit()
    for key in self.__commWiseSampleIndices.iterkeys():
        globalFreqVector = dd(int)
        for index in self.__commWiseSampleIndices[key]:
            tokens = self._tokenize(self.__data[index][1])
            freqVector = self.freqVector(tokens)
            for word, freq in freqVector.iteritems():  # iterate (word, freq) pairs
                globalFreqVector[word] += freq
        words = [x + "$:$:" + str(y) for x, y in globalFreqVector.iteritems() if x in filteredLexicon]
        if len(words) > 0:
            outputFile.write(key + '\t' + ' '.join(words) + '\n')
    outputFile.write('background' + '\t' + ' '.join([x + "$:$:" + str(y) for x, y in backGroundVector.iteritems() if x in filteredLexicon]) + '\n')
    outputFile.close()
def get_weight_matrix(cites, indices):
    id2index = {}
    for i, id in enumerate(indices):
        id2index[id] = i
    pair_cnt = dd(int)
    cited = dd(list)
    for c1, c2s in cites.iteritems():
        for c2 in c2s:
            cited[c2].append(c1)
        for ii in c2s:
            for jj in c2s:
                if ii == jj or ii not in id2index or jj not in id2index:
                    continue
                i, j = id2index[ii], id2index[jj]
                pair_cnt[(i, j)] += 1
    for c1, c2s in cited.iteritems():
        for ii in c2s:
            for jj in c2s:
                if ii == jj or ii not in id2index or jj not in id2index:
                    continue
                i, j = id2index[ii], id2index[jj]
                pair_cnt[(i, j)] += 1
    row, col, data = [], [], []
    for k, v in pair_cnt.iteritems():
        i, j = k
        row.append(i)
        col.append(j)
        data.append(v)
    row, col, data = np.array(row), np.array(col), np.array(data, dtype=np.float32)
    w = sparse.coo_matrix((data, (row, col)), shape=(len(indices), len(indices))).tocsr()
    return w
def createPatternMap(stretchyPatterns):
    print "Creating pattern map"
    stretchyPatterns = open(stretchyPatterns)
    patternMap = dd(lambda: dd(int))
    count = 0
    while 1:
        stretchyLine = stretchyPatterns.readline().strip()
        if stretchyLine == "":
            break
        for p in set(stretchyLine.split("\t")[1:]):
            patternMap[p][stretchyLine.split("\t")[0]] += 1
        count += 1
        # progress indicator: 1/(count % 5000) raises ZeroDivisionError every
        # 5000 lines, and the except branch prints the running count
        try:
            dummy = 1 / (count % 5000)
        except:
            print count,
    print
    return patternMap
def assign_orders_and_drones_to_warehouses(simulation_parameters, weights, warehouse_info,
                                           order_info, order_location_matrix):
    # # print order_location_matrix
    kmeans, cluster_assignments = find_centers(order_location_matrix, len(warehouse_info))
    warehouse_location_matrix = np.array([w['loc'] for w in warehouse_info.values()])
    # print warehouse_location_matrix
    # print kmeans.predict(warehouse_location_matrix)
    cluster_to_warehouses = find_closest_warehouse(kmeans.cluster_centers_, warehouse_location_matrix)
    warehouses_to_orders = dd(lambda: dict())
    warehouses_order_ids = dd(lambda: dict())
    warehouses_drone_numbers = dd(lambda: dict())
    total_order_weight = sum([order['weight'] for order in order_info.values()])
    # print total_order_weight
    # print simulation_parameters[2]
    for (cluster, warehouse) in enumerate(cluster_to_warehouses):
        assigned_orders = [order for (order, assignment) in enumerate(cluster_assignments)
                           if assignment == cluster]
        total_order_weight_for_warehouse = sum([order_info[order]['weight'] for order in assigned_orders])
        # print total_order_weight_for_warehouse
        warehouses_to_orders[warehouse]['orders'] = [order for order in assigned_orders]
        warehouses_order_ids[warehouse] = [order for order in assigned_orders]
        warehouses_to_orders[warehouse]['n_drones'] = floor(
            float(total_order_weight_for_warehouse) / total_order_weight * simulation_parameters[2])
        warehouses_drone_numbers[warehouse] = [None, None]
        warehouses_drone_numbers[warehouse][0] = floor(
            float(total_order_weight_for_warehouse) / float(total_order_weight + len(warehouse_info))
            * simulation_parameters[2])
        warehouses_drone_numbers[warehouse][1] = floor(
            (1 - float(total_order_weight_for_warehouse) / float(total_order_weight + len(warehouse_info)))
            * simulation_parameters[2])
        # print warehouses_to_orders[warehouse]
        # print len(warehouses_to_orders[warehouse])
    #print warehouses_to_orders
    return warehouses_drone_numbers, warehouses_order_ids
def prepareSpeakerBasedData():
    print 'Preparing speaker based data..'
    tweetDir = '/usr0/home/pgadde/Work/Ethnic/WordsBasedFiltering/MapReduce/ScreenNamesBased/'
    blackTweets = open(tweetDir + 'blackTweets')
    jewishTweets = open(tweetDir + 'jewishTweets')
    blackOutput = open(tweetDir + 'blackTweets.sp', 'w')
    jewishOutput = open(tweetDir + 'jewishTweets.sp', 'w')
    blackSpeakers = dd(list)
    for tweet in blackTweets:
        tweet = tweet.strip().split('\t')
        screenName = tweet[1]
        content = tweet[-1]
        blackSpeakers[screenName].append(content)
    for speaker in blackSpeakers.iterkeys():
        blackOutput.write(speaker + '\t' + '\t'.join(blackSpeakers[speaker]) + '\n')
    blackOutput.close()
    jewishSpeakers = dd(list)
    for tweet in jewishTweets:
        tweet = tweet.strip().split('\t')
        screenName = tweet[1]
        content = tweet[-1]
        jewishSpeakers[screenName].append(content)
    for speaker in jewishSpeakers.iterkeys():
        jewishOutput.write(speaker + '\t' + '\t'.join(jewishSpeakers[speaker]) + '\n')
    jewishOutput.close()
def aggregate(golden, guesses):
    """ Aggregates over the results """
    breakdown = dd(lambda: (0.0, 0.0, 0.0, 0.0))
    breakdown_N = dd(int)
    N = 0
    A, L, NL, R = 0.0, 0.0, 0.0, 0.0
    for tag, gold in golden.items():
        if tag in guesses:
            guess = guesses[tag]
            # assumes only 1 golden analysis
            if len(gold) > 1:
                for elem in gold[1:]:
                    assert elem == gold[0]
            acc, lev, rank = evaluate(gold[0], guess)
            A, L, NL, R = A + acc, L + lev, NL + (lev / len(gold[0])), R + rank
            # compute results broken down by POS tag
            pos = tag[-1].split(",")[0].replace("pos=", "")
            _A, _L, _NL, _R = breakdown[pos]
            breakdown[pos] = _A + acc, _L + lev, _NL + (lev / len(gold[0])), _R + rank
            breakdown_N[pos] += 1
        else:
            sys.stderr.write("warning: no guess provided for (%s)\n" % " ".join(tag))
        N += 1
    return A / N, L / N, NL / N, R / N, breakdown, breakdown_N
""" This is an efficient algorithm to find the size of a subtree from every node in O(n) time. The idea is to use one dfs and first calculate the size of subtree of children of a node recursively. Then add the size of each subtree of its children to get the size of its subtree. """ from collections import defaultdict as dd def dfs(source, parent): # Initial size of root is 1 size[source] = 1 for child in graph[source]: if child != parent: # Recursively calculate size of subtree of children nodes dfs(child, source) # Adding size of each child's subtree. size[source] += size[child] size = dd(int) graph = dd(set) n = int(input()) for i in range(n - 1): u, v = map(int, input().split()) graph[u].add(v) graph[v].add(u) dfs(1, 0) print(size)
# changes requests_url_count dict to a list of tuples (url, count),
# sorts it by count (descending) then url (lexicographically), and prints
def print_result(requests_url_count, invalid_lines_count):
    url_count_list = [(url, count) for url, count in requests_url_count.items()]
    # sort - count descending, url lexicographically
    url_count_list.sort(key=lambda p: (-p[1], p[0]))
    for url, count in url_count_list:
        print('"{0}",{1}'.format(url, count))
    if invalid_lines_count > 0:
        print('\nInvalid log lines: {0}'.format(invalid_lines_count), file=sys.stderr)


if __name__ == '__main__':
    if len(sys.argv) != 2:
        print('Usage: python page_report.py <path-to-log-file>\n')
    else:
        path = sys.argv[1]
        with open(path, 'r') as logs:
            requests_url_count = dd(int)
            invalid_lines_count = 0
            for line in logs:
                try:
                    stripped_url = parse_line(line)
                    requests_url_count[stripped_url] += 1
                except ValueError:
                    invalid_lines_count += 1
            print_result(requests_url_count, invalid_lines_count)
], dtype=float)


def _get_colors(num_colors):
    colors = []
    for i in np.arange(0., 360., 360. / num_colors):
        hue = i / 360.
        lightness = (50 + np.random.rand() * 10) / 100.
        saturation = (90 + np.random.rand() * 10) / 100.
        colors.append(colorsys.hls_to_rgb(hue, lightness, saturation))
    return colors


#for iter in range(itertime):
kNN = 5
sga2sgaAaff = dd(set)
sgaAsga2aff = dd(int)
set_gene = set()
path = 'sga2sgaAff_baseline_brca.txt'
f = open(path, 'r')
pathout = 'out.txt'
fo = open(pathout, 'w')
next(f)
for line in f:
    l = line.strip().split('\t')
    sga1, sga2, aff = l[0], l[1], int(l[2])
    sga2sgaAaff[sga1].add((sga2, aff))
    #sgaAsga2aff[(sga1,sga2)] = aff
    set_gene.add(sga1)
    set_gene.add(sga2)
from sys import stdin
from math import sqrt, log, log2
from fractions import Fraction
from collections import defaultdict as dd
import random

t = int(input())
for _ in range(t):
    n, k = map(int, input().split())
    nums = list(map(int, stdin.readline().split()))
    # n = 1000
    # k = random.randint(1, 10)
    # nums = []
    # for j in range(n):
    #     nums.append(random.randint(1, 10))
    freq = dd(int)
    maxkey, maxval = -1, 0
    for i in nums:
        rem = i % k  # remainder (renamed from `t` to avoid shadowing the test-case count)
        if rem != 0:
            freq[k - rem] += 1
            if freq[k - rem] > maxval:
                maxval = freq[k - rem]
                maxkey = 0
    for key in freq:
        if freq[key] == maxval and key > maxkey:
            maxkey = key
    if len(freq) == 0:
        print(0)
def doc2bow_from_word_ids(document):
    counter = dd(int)
    for word_idx in document:
        counter[word_idx] += 1
    document_bow = sorted(iteritems(counter))
    return document_bow
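For instance, assuming iteritems comes from six (or is simply dict.items on Python 3), a document given as word ids collapses to sorted (id, count) pairs:

# doc2bow_from_word_ids([3, 1, 3, 2, 1, 3]) -> [(1, 2), (2, 1), (3, 3)]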
from collections import defaultdict as dd

d = dd(int)
n = int(input())
for _ in range(n):
    a, x = map(int, input().split())
    d[a] = x
m = int(input())
for _ in range(m):
    b, y = map(int, input().split())
    if y > d[b]:
        d[b] = y
print(sum(d.values()))
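A worked example of the merge above: the second pass keeps a value only when it beats what is already stored, and missing keys default to 0.

# stdin:          state of d:
# 2
# 1 5             d == {1: 5}
# 2 3             d == {1: 5, 2: 3}
# 2
# 1 7             7 > 5 -> d[1] = 7
# 3 2             2 > 0 -> d[3] = 2  (d[3] defaults to 0)
# printed: 12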
def setup(self):
    self.train_test = np.append(self.data['train']['x_t'], self.data['test']['x_t'], 0)
    self.train_losses = dd(list)
    self.test_losses = dd(list)
    self.model_losses = []
    # Location of 0/-1 in the transformed space
    self.zero_line = self.model.scalery.inverse_transform(
        np.zeros((1, self.data['train']['y_t'].shape[-1])))
    self.neg_line = self.model.scalery.inverse_transform(
        np.zeros((1, self.data['train']['y_t'].shape[-1])) - 1)
    if self.args.darktheme:
        plt.style.use('dark_background')
    n_ext = 3  # extra rows, in addition to 1-1 scatter plots
    n_col = min(5, self.data['test']['y'].shape[1])
    n_row = n_ext + (n_col + n_col - 1) // n_col
    fig = plt.figure(figsize=(5 * n_col, 2 * n_row))
    meta = enumerate(GridSpec(n_row, 1, hspace=0.35))
    conts = [
        GridSubplot(1, 2 if i in [0, n_row - 1, n_row - 2] else n_col,
                    subplot_spec=o, wspace=0.3 if i else 0.45)
        for i, o in meta
    ]
    axs = [plt.Subplot(fig, sub) for container in conts for sub in container]
    axs = axs[:n_col + 2] + axs[-4:]
    [fig.add_subplot(ax) for ax in axs]
    self.axes = [ax.twinx() for ax in axs[:2]] + axs
    self.labels = get_labels(get_sensor_bands(self.args.sensor, self.args),
                             self.model.output_slices, n_col)[:n_col]
    plt.ion()
    plt.show()
    plt.pause(1e-9)
    if self.args.animate:
        ani_path = Path('Animations')
        ani_tmp = ani_path.joinpath('tmp')
        ani_tmp.mkdir(parents=True, exist_ok=True)
        # Delete any prior run temporary animation files
        list(map(os.remove, ani_tmp.glob('*.png')))
        # '-tune zerolatency' fixes issue where firefox won't play the mp4
        # '-vf pad=...' ensures height/width are divisible by 2 (required by .h264 - https://stackoverflow.com/questions/20847674/ffmpeg-libx264-height-not-divisible-by-2)
        extra_args = ["-tune", "zerolatency",
                      "-vf", "pad=width=ceil(iw/2)*2:height=ceil(ih/2)*2:color=white"]
        ani_writer = self.ani_writer = animation.writers['ffmpeg_file'](fps=3, extra_args=extra_args)
        ani_writer.setup(fig, ani_path.joinpath('MDN.mp4').as_posix(), dpi=100,
                         frame_prefix=ani_tmp.joinpath('_').as_posix(), clear_temp=False)
def __init__(self):
    self.d = dd()
def get_estimates(args, x_train=None, y_train=None, x_test=None, y_test=None,
                  output_slices=None, dataset_labels=None, x_sim=None, y_sim=None,
                  return_model=False, return_coefs=False):
    ''' Estimate all target variables for the given x_test. If a model doesn't
        already exist, creates a model with the given training data. '''
    # Add x/y scalers to the args object
    generate_scalers(args, x_train, x_test)

    if args.verbose:
        print(f'\nUsing {len(args.wavelengths)} wavelength(s) in the range '
              f'[{args.wavelengths[0]}, {args.wavelengths[-1]}]')
        if x_train is not None: print_dataset_stats(x=x_train, label='Train')
        if y_train is not None: print_dataset_stats(y=y_train, label='Train')
        if x_test is not None: print_dataset_stats(x=x_test, label='Test')
        if y_test is not None: print_dataset_stats(y=y_test, label='Test')

    # Add a few additional variables to be stored in the generated config file
    if x_train is not None: setattr(args, 'data_xtrain_shape', x_train.shape)
    if y_train is not None: setattr(args, 'data_ytrain_shape', y_train.shape)
    if x_test is not None: setattr(args, 'data_xtest_shape', x_test.shape)
    if y_test is not None: setattr(args, 'data_ytest_shape', y_test.shape)
    if dataset_labels is not None:
        sets_str = ','.join(sorted(map(str, np.unique(dataset_labels))))
        sets_hash = hashlib.sha256(sets_str.encode('utf-8')).hexdigest()
        setattr(args, 'datasets_hash', sets_hash)

    model_path = generate_config(args, create=x_train is not None)
    args.config_name = model_path.name

    predict_kwargs = {
        'avg_est': getattr(args, 'avg_est', False),
        'threshold': getattr(args, 'threshold', None),
        'confidence_interval': getattr(args, 'CI', None),
        'use_gpu': getattr(args, 'use_gpu', False),
        'chunk_size': getattr(args, 'chunk_size', 1e5),
        'return_coefs': True,
    }

    x_full, y_full = x_train, y_train
    x_valid, y_valid = None, None

    outputs = dd(list)
    for round_num in trange(args.n_rounds, disable=args.verbose or (args.n_rounds == 1) or args.silent):
        args.curr_round = round_num
        curr_round_seed = args.seed + round_num if args.seed is not None else None
        np.random.seed(curr_round_seed)

        # 75% of rows used in bagging
        if using_feature(args, 'bagging') and x_train is not None and args.n_rounds > 1:
            (x_train, y_train), (x_valid, y_valid) = split_data(x_full, y_full,
                                                                n_train=0.75, seed=curr_round_seed)

        datasets = {k: dict(zip(['x', 'y'], v)) for k, v in {
            'train': [x_train, y_train],
            'valid': [x_valid, y_valid],
            'test': [x_test, y_test],
            'full': [x_full, y_full],
            'sim': [x_sim, y_sim],
        }.items() if v[0] is not None}

        model_kwargs = {
            'n_mix': args.n_mix,
            'hidden': [args.n_hidden] * args.n_layers,
            'lr': args.lr,
            'l2': args.l2,
            'n_iter': args.n_iter,
            'batch': args.batch,
            'imputations': args.imputations,
            'epsilon': args.epsilon,
            'scalerx': TransformerPipeline([S(*args, **kwargs) for S, args, kwargs in args.x_scalers]),
            'scalery': TransformerPipeline([S(*args, **kwargs) for S, args, kwargs in args.y_scalers]),
            'model_path': model_path.joinpath(f'Round_{round_num}'),
            'no_load': args.no_load,
            'no_save': args.no_save,
            'seed': curr_round_seed,
            'verbose': args.verbose,
        }

        model = MDN(**model_kwargs)
        model.fit(x_train, y_train, output_slices, args=args, datasets=datasets)

        if return_model:
            outputs['model'].append(model)

        if return_coefs:
            outputs['scalerx'].append(model.scalerx)
            outputs['scalery'].append(model.scalery)

        if x_test is not None:
            (estimates, *confidence), coefs = model.predict(x_test, **predict_kwargs)
            outputs['estimates'].append(estimates)

            if return_coefs:
                outputs['coefs'].append(coefs)

            if len(confidence):
                upper, lower = confidence
                outputs['upper_bound'].append(upper)
                outputs['lower_bound'].append(lower)

            if args.verbose and y_test is not None:
                median = np.median(outputs['estimates'], axis=0)
                labels = get_labels(args.wavelengths, output_slices, n_out=y_test.shape[1])
                for lbl, y1, y2 in zip(labels, y_test.T, median.T):
                    print(performance(f'{lbl:>7s} Median', y1, y2))
                print(f'--- Done round {round_num} ---\n')

        if hasattr(model, 'session'):
            model.session.close()

    # Create compressed model archive
    compress(model_path)

    if len(outputs) == 1:
        outputs = list(outputs.values())[0]
    return outputs, model.output_slices
def summarize_sample_pairs(self):
    from modules.Rage_Plots import rage_subplots
    xLen, yLen = 5, 5
    subplot = rage_subplots.subplot(xLen, yLen, True)
    total_features = len(self.input.features)
    f_num = 1
    LOG = True
    feature_sample_ranks = dd(lambda: dd(float))
    for s in self.input.samples:
        for i, (b, a) in enumerate(sorted([(b, a) for (a, b) in self.input.sample_vals[s].items()])):
            if i == 0:
                match, rank, m_list = b, 1, [a]
            elif b == match:
                m_list.append(a)
            else:
                for m in m_list:
                    feature_sample_ranks[s][m] = rank
                match, rank, m_list = b, rank + 1, [a]
        for m in m_list:
            feature_sample_ranks[s][m] = rank
    f_num = 1
    fig = matplotlib.pyplot.gcf()
    fig.set_size_inches(18.5, 9.5)
    s_id = ''
    for i in range(len(self.input.samples)):
        for j in range(i + 1, len(self.input.samples)):
            s1, s2 = self.input.samples[i], self.input.samples[j]
            fr1, fr2 = feature_sample_ranks[s1], feature_sample_ranks[s2]
            fkeys = list(set(fr1.keys() + fr2.keys()))
            f_order = [x[1] for x in sorted([(fr1[f] + fr2[f], f) for f in fkeys])]
            x_range = range(len(f_order))
            v1 = [log(1.0 + self.input.sample_vals[s1][f]) if f in self.input.sample_vals[s1] else 0
                  for f in f_order]
            v2 = [log(1.0 + self.input.sample_vals[s2][f]) if f in self.input.sample_vals[s2] else 0
                  for f in f_order]
            vs1 = scale_vals(v1)
            vs2 = scale_vals(v2)
            sv1 = svgf(vs1, 61, 2, mode='nearest')
            sv2 = svgf(vs2, 61, 2, mode='nearest')
            subplot.add_line(x_range, sv1, {'lw': 0.2})
            subplot.add_line(x_range, sv2, {'lw': 0.2})
            sv_mix = [(sv1[z] + sv2[z]) / 2.0 for z in range(len(sv1))]
            step1, step2 = 50, 100
            subplot.add_line(x_range, sv_mix, {'lw': 0.5, 'color': 'k'})
            z_diffs, z_steps = [], []
            for z in range(step2, len(sv_mix), step1):
                z1 = sv1[z - step2:z + step2]
                z2 = sv2[z - step2:z + step2]
                z_diffs.append(sum([(z1[x] - z2[x]) * (z1[x] - z2[x]) for x in range(len(z1))]))
                z_steps.append((z - step2, z + step2))
                #subplot.add_line(x_range[z-step2:z+step2],sv_mix[z-step2:z+step2],{'color': 'purple','alpha':0.4})
            diff_colors = get_colors(z_diffs, plt.cm.jet)
            for z in range(len(z_steps)):
                zA, zB = z_steps[z]
                subplot.add_line(x_range[zA:zB], sv_mix[zA:zB],
                                 {'color': diff_colors[z], 'alpha': 0.5, 'lw': 1})
            #subplot.change_limits({'x1': int(len(x_range)*1.08), 'y0': -0.05,'y1': 0.93})
            subplot.ax.text(int(len(x_range) * 0.03), 0.72,
                            s1 + ' ' + s_id + ' ' + s2 + ' ' + s_id, color='red')
            #subplot.ax.plot([0,len(x_range)],[0,0],color='k',linewidth=1,zorder=2)
            if not subplot.update():
                plt.suptitle('Pair Comparison')
                plt.subplots_adjust(left=0.04, bottom=0.01, right=0.96, top=0.95,
                                    wspace=0.03, hspace=0.03)
                fig.savefig('pairs_out' + str(f_num) + '.png', dpi=100)
                f_num += 1
                if f_num > 10:
                    sys.exit()
    sys.exit()
def search_omw(lang=None, lang2=None, q=None):  # lang2 added: it is read below but was missing from the signature
    if lang and q:
        lang_id = lang
        lang_id2 = lang2
        query = q
    else:
        lang_id = request.form['lang']
        lang_id2 = request.form['lang2']
        query = request.form['query']
    query = query.strip()
    sense = dd(list)
    lang_sense = dd(lambda: dd(list))
    # GO FROM FORM TO SENSE
    for s in query_omw("""
        SELECT s.id as s_id, ss_id, wid, fid, lang_id, pos_id, lemma
        FROM (SELECT w_id as wid, form.id as fid, lang_id, pos_id, lemma
              FROM (SELECT id, lang_id, pos_id, lemma
                    FROM f WHERE lemma GLOB ? AND lang_id in (?,?)) as form
              JOIN wf_link ON form.id = wf_link.f_id) word
        JOIN s ON wid=w_id
        """, ['[' + query[0].upper() + query[0].lower() + ']' + query[1:],
              lang_id, lang_id2]):
        sense[s['ss_id']] = [s['s_id'], s['wid'], s['fid'],
                             s['lang_id'], s['pos_id'], s['lemma']]
        lang_sense[s['lang_id']][s['ss_id']] = [s['s_id'], s['wid'], s['fid'],
                                                s['pos_id'], s['lemma']]
    pos = fetch_pos()
    lang_dct, lang_code = fetch_langs()
    ss, senses, defs, exes, links = fetch_ss_basic(sense.keys())
    labels = fetch_labels(lang_id, set(senses.keys()))
    resp = make_response(render_template('omw_results.html',
                                         langsel=int(lang_id),
                                         langsel2=int(lang_id2),
                                         pos=pos,
                                         lang_dct=lang_dct,
                                         sense=sense,
                                         senses=senses,
                                         ss=ss,
                                         links=links,
                                         defs=defs,
                                         exes=exes,
                                         labels=labels))
    resp.set_cookie('selected_lang', lang_id)
    resp.set_cookie('selected_lang2', lang_id2)
    return resp
def makemut(args, hc, avoid, alignopts):
    mutid_list = []
    for site in hc:
        mutid_list.append(site['chrom'] + '_' + str(site['start']) + '_' + str(site['end']) +
                          '_' + str(site['vaf']) + '_' + str(site['altbase']))

    try:
        if args.seed is not None:
            random.seed(int(args.seed) + int(hc[0]['start']))

        bamfile = pysam.Samfile(args.bamFileName, 'rb')
        bammate = pysam.Samfile(args.bamFileName, 'rb')  # use for mates to avoid iterator problems
        reffile = pysam.Fastafile(args.refFasta)
        tmpbams = []

        #snvfrac = float(args.snvfrac)

        chrom = None
        vaf = None
        mutpos_list = []
        altbase_list = []

        for site in hc:
            if chrom is None:
                chrom = site['chrom']
            else:
                assert chrom == site['chrom'], "haplotype clusters cannot span multiple chromosomes!"

            if vaf is None:
                vaf = site['vaf']
            elif vaf != site['vaf']:
                logger.warning("multiple VAFs for single haplotype, using first encountered VAF: %f" % vaf)

            mutpos = int(random.uniform(site['start'], site['end'] + 1))  # position of mutation in genome
            mutpos_list.append(mutpos)
            altbase_list.append(site['altbase'])

        mutbase_list = []
        refbase_list = []
        mutstr_list = []

        for n, mutpos in enumerate(mutpos_list):
            refbase = reffile.fetch(chrom, mutpos - 1, mutpos)
            altbase = altbase_list[n]
            refbase_list.append(refbase)

            if altbase == refbase.upper() and not args.ignoreref:
                logger.warning("%s specified ALT base matches reference, skipping mutation" % mutid_list[n])
                return None

            try:
                mutbase = mut(refbase, altbase)
                mutbase_list.append(mutbase)
            except ValueError as e:
                logger.warning(mutid_list[n] + " " + ' '.join(("skipped site:", chrom,
                               str(hc[n]['start']), str(hc[n]['end']),
                               "due to N base:", str(e), "\n")))
                return None

            mutstr_list.append(refbase + "-->" + str(mutbase))

        # optional CNV file
        cnv = None
        if (args.cnvfile):
            cnv = pysam.Tabixfile(args.cnvfile, 'r')

        hapstr = "_".join(('haplo', chrom, str(min(mutpos_list)), str(max(mutpos_list))))
        log = open('addsnv_logs_' + os.path.basename(args.outBamFile) + '/' +
                   os.path.basename(args.outBamFile) + "." + hapstr + ".log", 'w')

        tmpoutbamname = args.tmpdir + "/" + hapstr + ".tmpbam." + str(uuid4()) + ".bam"
        logger.info("%s creating tmp bam: %s" % (hapstr, tmpoutbamname))
        outbam_muts = pysam.Samfile(tmpoutbamname, 'wb', template=bamfile)

        mutfail, hasSNP, maxfrac, outreads, mutreads, mutmates = mutation.mutate(
            args, log, bamfile, bammate, chrom, min(mutpos_list), max(mutpos_list) + 1,
            mutpos_list, avoid=avoid, mutid_list=mutid_list, is_snv=True,
            mutbase_list=mutbase_list, reffile=reffile)

        if mutfail:
            outbam_muts.close()
            os.remove(tmpoutbamname)
            return None

        # pick reads to change
        readlist = []
        for extqname, read in outreads.iteritems():
            if read.seq != mutreads[extqname]:
                readlist.append(extqname)

        logger.info("%s len(readlist): %s" % (hapstr, str(len(readlist))))
        readlist.sort()
        random.shuffle(readlist)

        if len(readlist) < int(args.mindepth):
            logger.warning("%s too few reads in region (%s) skipping..." % (hapstr, str(len(readlist))))
            outbam_muts.close()
            os.remove(tmpoutbamname)
            return None

        if vaf is None:
            vaf = float(args.mutfrac)  # default minor allele freq if not otherwise specified

        if cnv:  # cnv file is present
            if chrom in cnv.contigs:
                for cnregion in cnv.fetch(chrom, min(mutpos_list), max(mutpos_list) + 1):
                    cn = float(cnregion.strip().split()[3])  # expect chrom,start,end,CN
                    logger.info(hapstr + "\t" + ' '.join(("copy number in snp region:", chrom,
                                str(min(mutpos_list)), str(max(mutpos_list)), "=", str(cn))))
                    if float(cn) > 0.0:
                        vaf = 1.0 / float(cn)
                    else:
                        vaf = 0.0
                    logger.info("%s adjusted VAF: %f" % (hapstr, vaf))
        else:
            logger.info("%s selected VAF: %f" % (hapstr, vaf))

        lastread = int(len(readlist) * vaf)

        # pick at least args.minmutreads if possible
        if lastread < int(args.minmutreads):
            if len(readlist) > int(args.minmutreads):
                lastread = int(args.minmutreads)
                logger.warning("%s forced %d reads." % (hapstr, lastread))
            else:
                logger.warning("%s dropped site with fewer reads than --minmutreads" % hapstr)
                os.remove(tmpoutbamname)
                return None

        readtrack = dd(list)

        for readname in readlist:
            orig_name, readpos, pairend = readname.split(',')
            readtrack[orig_name].append('%s,%s' % (readpos, pairend))

        usedreads = 0
        newreadlist = []

        for orig_name in readtrack:
            for read_instance in readtrack[orig_name]:
                newreadlist.append(orig_name + ',' + read_instance)
                usedreads += 1
            if usedreads >= lastread:
                break

        readlist = newreadlist

        logger.info("%s picked: %d" % (hapstr, len(readlist)))

        wrote = 0
        nmut = 0
        mut_out = {}
        # change reads from .bam to mutated sequences
        for extqname, read in outreads.iteritems():
            if read.seq != mutreads[extqname]:
                if not args.nomut and extqname in readlist:
                    qual = read.qual  # changing seq resets qual (see pysam API docs)
                    read.seq = mutreads[extqname]  # make mutation
                    read.qual = qual
                    nmut += 1
            if (not hasSNP) or args.force:
                wrote += 1
                mut_out[extqname] = read

        muts_written = {}

        for extqname in mut_out:
            if extqname not in muts_written:
                outbam_muts.write(mut_out[extqname])
                muts_written[extqname] = True

                if mutmates[extqname] is not None:
                    # is mate also in mutated list?
                    mate_read = mutmates[extqname]

                    pairname = 'F'  # read is first in pair
                    if mate_read.is_read2:
                        pairname = 'S'  # read is second in pair
                    if not mate_read.is_paired:
                        pairname = 'U'  # read is unpaired

                    mateqname = ','.join((mate_read.qname, str(mate_read.pos), pairname))

                    if mateqname in mut_out:
                        # yes: output mutated mate
                        outbam_muts.write(mut_out[mateqname])
                        muts_written[mateqname] = True
                    else:
                        # no: output original mate
                        outbam_muts.write(mate_read)

        logger.info("%s wrote: %d, mutated: %d" % (hapstr, wrote, nmut))

        if not hasSNP or args.force:
            outbam_muts.close()
            aligners.remap_bam(args.aligner, tmpoutbamname, args.refFasta, alignopts,
                               mutid=hapstr, paired=(not args.single),
                               picardjar=args.picardjar, insane=args.insane)

            outbam_muts = pysam.Samfile(tmpoutbamname, 'rb')
            coverwindow = 1
            incover = countReadCoverage(bamfile, chrom, min(mutpos_list) - coverwindow,
                                        max(mutpos_list) + coverwindow)
            outcover = countReadCoverage(outbam_muts, chrom, min(mutpos_list) - coverwindow,
                                         max(mutpos_list) + coverwindow)

            avgincover = float(sum(incover)) / float(len(incover))
            avgoutcover = float(sum(outcover)) / float(len(outcover))
            logger.info("%s avgincover: %f, avgoutcover: %f" % (hapstr, avgincover, avgoutcover))

            spikein_snvfrac = 0.0
            if wrote > 0:
                spikein_snvfrac = float(nmut) / float(wrote)

            # qc cutoff for final snv depth
            if (avgoutcover > 0 and avgincover > 0 and
                    avgoutcover / avgincover >= float(args.coverdiff)) or args.force:
                tmpbams.append(tmpoutbamname)
                for n, site in enumerate(hc):
                    snvstr = chrom + ":" + str(site['start']) + "-" + str(site['end']) + \
                             " (VAF=" + str(vaf) + ")"
                    log.write("\t".join(("snv", snvstr, str(mutpos_list[n]), mutstr_list[n],
                                         str(avgoutcover), str(avgoutcover),
                                         str(spikein_snvfrac), str(maxfrac))) + "\n")
            else:
                outbam_muts.close()
                os.remove(tmpoutbamname)
                if os.path.exists(tmpoutbamname + '.bai'):
                    os.remove(tmpoutbamname + '.bai')
                logger.warning("%s dropped for outcover/incover < %s" % (hapstr, str(args.coverdiff)))
                return None

        outbam_muts.close()
        bamfile.close()
        bammate.close()
        log.close()

        return tmpbams

    except Exception, e:
        sys.stderr.write("*" * 60 + "\nERROR\t" + now() + "\tencountered error in mutation spikein: " +
                         str(mutid_list) + "\n")
        traceback.print_exc(file=sys.stdout)
        sys.stderr.write("*" * 60 + "\n")

        if os.path.exists(tmpoutbamname):
            os.remove(tmpoutbamname)
        if os.path.exists(tmpoutbamname + '.bai'):
            os.remove(tmpoutbamname + '.bai')

        return None
def najdlcesarskie(dlugosci, slowa):
    for i in range(len(dlugosci)):
        L = set()
        for s in slowa[dlugosci[i]]:
            for k in range(1, 32):
                slowo2 = ceasar(s, k)
                if (slowo2 in slowa[dlugosci[i]]):
                    L.add(s)
        if (L != set()):
            return L
    return None


plik = open("popularne.txt")
slowa = dd(set)
dl = set()
for w in plik:
    w = w.strip()
    if (zle_slowo(w) == False):
        continue
    else:
        slowa[len(w)].add(w)
        dl.add(len(w))
dlugosci = list(dl)
dlugosci.sort(reverse=True)
print(najdlcesarskie(dlugosci, slowa))
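The excerpt calls two helpers that are not shown. A minimal sketch under assumptions: ceasar shifts letters within the 32-letter Polish alphabet (which would match the shift range range(1, 32)), and zle_slowo returns True only for words made entirely of those letters.

ALFABET = 'aąbcćdeęfghijklłmnńoóprsśtuwyzźż'  # 32 letters; assumed alphabet

def ceasar(slowo, k):
    # Shift each letter k positions within ALFABET (assumed implementation).
    return ''.join(ALFABET[(ALFABET.index(c) + k) % len(ALFABET)] for c in slowo)

def zle_slowo(slowo):
    # Assumed filter: accept only words composed entirely of alphabet letters.
    return all(c in ALFABET for c in slowo)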
def makemut(args, chrom, start, end, vaf, ins, avoid, alignopts):
    ''' if ins is a sequence, it will be inserted at start, otherwise delete from start to end '''

    if args.seed is not None:
        random.seed(int(args.seed) + int(start))

    mutid = chrom + '_' + str(start) + '_' + str(end) + '_' + str(vaf)
    if ins is None:
        mutid += ':DEL'
    else:
        mutid += ':INS:' + ins

    try:
        bamfile = pysam.Samfile(args.bamFileName, 'rb')
        bammate = pysam.Samfile(args.bamFileName, 'rb')  # use for mates to avoid iterator problems
        reffile = pysam.Fastafile(args.refFasta)
        tmpbams = []

        is_insertion = ins is not None
        is_deletion = ins is None

        snvfrac = float(args.snvfrac)

        mutstr = get_mutstr(chrom, start, end, ins, reffile)

        del_ln = 0
        if is_deletion:
            del_ln = end - start

        mutpos = start
        mutpos_list = [start]

        # optional CNV file
        cnv = None
        if (args.cnvfile):
            cnv = pysam.Tabixfile(args.cnvfile, 'r')

        log = open('addindel_logs_' + os.path.basename(args.outBamFile) + '/' +
                   os.path.basename(args.outBamFile) + "." +
                   "_".join((chrom, str(start), str(end))) + ".log", 'w')

        tmpoutbamname = args.tmpdir + "/" + mutid + ".tmpbam." + str(uuid4()) + ".bam"
        logger.info("%s creating tmp bam: %s" % (mutid, tmpoutbamname))
        outbam_muts = pysam.Samfile(tmpoutbamname, 'wb', template=bamfile)

        mutfail, hasSNP, maxfrac, outreads, mutreads, mutmates = mutation.mutate(
            args, log, bamfile, bammate, chrom, mutpos, mutpos + del_ln + 1, mutpos_list,
            avoid=avoid, mutid_list=[mutid], is_insertion=is_insertion,
            is_deletion=is_deletion, ins_seq=ins, reffile=reffile,
            indel_start=start, indel_end=end)

        if mutfail:
            outbam_muts.close()
            os.remove(tmpoutbamname)
            return None

        # pick reads to change
        readlist = []
        for extqname, read in outreads.iteritems():
            if read.seq != mutreads[extqname]:
                readlist.append(extqname)

        logger.info("%s len(readlist): %d" % (mutid, len(readlist)))
        readlist.sort()
        random.shuffle(readlist)

        if len(readlist) < int(args.mindepth):
            logger.warning("%s skipped, too few reads in region: %d" % (mutid, len(readlist)))
            outbam_muts.close()
            os.remove(tmpoutbamname)
            return None

        if vaf is None:
            vaf = float(args.mutfrac)  # default minor allele freq if not otherwise specified

        if cnv:  # cnv file is present
            if chrom in cnv.contigs:
                for cnregion in cnv.fetch(chrom, start, end):
                    cn = float(cnregion.strip().split()[3])  # expect chrom,start,end,CN
                    logger.info(mutid + "\t" + ' '.join(("copy number in snp region:",
                                chrom, str(start), str(end), "=", str(cn))))
                    if float(cn) > 0.0:
                        vaf = 1.0 / float(cn)
                    else:
                        vaf = 0.0
                    logger.info("%s adjusted VAF: %f" % (mutid, vaf))
        else:
            logger.info("%s selected VAF: %f" % (mutid, vaf))

        lastread = int(len(readlist) * vaf)

        # pick at least args.minmutreads if possible
        if lastread < int(args.minmutreads):
            if len(readlist) > int(args.minmutreads):
                lastread = int(args.minmutreads)
                logger.warning("%s forced %d reads" % (mutid, lastread))
            else:
                logger.warning("%s dropped site with fewer reads than --minmutreads" % mutid)
                os.remove(tmpoutbamname)
                return None

        readtrack = dd(list)

        for readname in readlist:
            orig_name, readpos, pairend = readname.split(',')
            readtrack[orig_name].append('%s,%s' % (readpos, pairend))

        usedreads = 0
        newreadlist = []

        for orig_name in readtrack:
            for read_instance in readtrack[orig_name]:
                newreadlist.append(orig_name + ',' + read_instance)
                usedreads += 1
            if usedreads >= lastread:
                break

        readlist = newreadlist

        logger.info("%s picked: %d reads" % (mutid, len(readlist)))

        wrote = 0
        nmut = 0
        mut_out = {}
        # change reads from .bam to mutated sequences
        for extqname, read in outreads.iteritems():
            if read.seq != mutreads[extqname]:
                if not args.nomut and extqname in readlist:
                    qual = read.qual  # changing seq resets qual (see pysam API docs)
                    read.seq = mutreads[extqname]  # make mutation
                    read.qual = qual
                    nmut += 1
            if not hasSNP or args.force:
                wrote += 1
                mut_out[extqname] = read

        muts_written = {}

        for extqname in mut_out:
            if extqname not in muts_written:
                outbam_muts.write(mut_out[extqname])
                muts_written[extqname] = True

                if mutmates[extqname] is not None:
                    # is mate also in mutated list?
                    mate_read = mutmates[extqname]

                    pairname = 'F'  # read is first in pair
                    if mate_read.is_read2:
                        pairname = 'S'  # read is second in pair
                    if not mate_read.is_paired:
                        pairname = 'U'  # read is unpaired

                    mateqname = ','.join((mate_read.qname, str(mate_read.pos), pairname))

                    if mateqname in mut_out:
                        # yes: output mutated mate
                        outbam_muts.write(mut_out[mateqname])
                        muts_written[mateqname] = True
                    else:
                        # no: output original mate
                        outbam_muts.write(mate_read)

        logger.info("%s wrote: %d, mutated: %d" % (mutid, wrote, nmut))

        if not hasSNP or args.force:
            outbam_muts.close()
            aligners.remap_bam(args.aligner, tmpoutbamname, args.refFasta, alignopts,
                               mutid=mutid, paired=(not args.single),
                               picardjar=args.picardjar, insane=args.insane)

            outbam_muts = pysam.Samfile(tmpoutbamname, 'rb')
            coverwindow = 1
            incover = countReadCoverage(bamfile, chrom, mutpos - coverwindow,
                                        mutpos + del_ln + coverwindow)
            outcover = countReadCoverage(outbam_muts, chrom, mutpos - coverwindow,
                                         mutpos + del_ln + coverwindow)

            avgincover = float(sum(incover)) / float(len(incover))
            avgoutcover = float(sum(outcover)) / float(len(outcover))

            spikein_frac = 0.0
            if wrote > 0:
                spikein_frac = float(nmut) / float(wrote)

            # qc cutoff for final snv depth
            if (avgoutcover > 0 and avgincover > 0 and
                    avgoutcover / avgincover >= float(args.coverdiff)) or args.force:
                tmpbams.append(tmpoutbamname)
                indelstr = ''
                if is_insertion:
                    indelstr = ':'.join(('INS', chrom, str(start), ins))
                else:
                    indelstr = ':'.join(('DEL', chrom, str(start), str(end)))

                snvstr = chrom + ":" + str(start) + "-" + str(end) + " (VAF=" + str(vaf) + ")"
                log.write("\t".join(("indel", indelstr, str(mutpos), mutstr, str(avgincover),
                                     str(avgoutcover), str(spikein_frac), str(maxfrac))) + "\n")
            else:
                outbam_muts.close()
                os.remove(tmpoutbamname)
                if os.path.exists(tmpoutbamname + '.bai'):
                    os.remove(tmpoutbamname + '.bai')
                logger.warning("%s dropped for outcover/incover < %s" % (mutid, str(args.coverdiff)))
                return None

        outbam_muts.close()
        bamfile.close()
        bammate.close()
        log.close()

        return sorted(tmpbams)

    except Exception, e:
        sys.stderr.write("*" * 60 + "\nencountered error in mutation spikein: " + mutid + "\n")
        traceback.print_exc(file=sys.stdout)
        sys.stderr.write("*" * 60 + "\n")
        if os.path.exists(tmpoutbamname):
            os.remove(tmpoutbamname)
        if os.path.exists(tmpoutbamname + '.bai'):
            os.remove(tmpoutbamname + '.bai')
        return None
def record(key, key2, d):
    if key in d:
        d[key][key2] += 1
    else:
        d[key] = dd(int)
        d[key][key2] = 1  # the count belongs in the nested dict, not the outer one
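With the nested assignment in place, the helper builds a two-level co-occurrence count, for example:

cooc = {}
record('walk', 'Verb', cooc)
record('walk', 'Verb', cooc)
record('walk', 'Noun', cooc)
# cooc == {'walk': defaultdict(int, {'Verb': 2, 'Noun': 1})}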
def comp_term_freq(entities):
    term_freq = dd(int)
    for ent in entities:
        for token in jieba.lcut(ent):
            term_freq[token] += 1
    return term_freq
def sentences(file_list):
    # generator: yields one tokenised comment at a time
    for file in file_list:
        try:
            comment_list = pickle.load(open(file, "rb"))
            for j in comment_list:
                yield (trim(j["raw_message"]).split())
        except:
            continue


logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
comment_pattern = re.compile("^comments_(1000_)?2016.*")
prev_file = None
for site in site_list:
    site_d = site + "/"
    files = dd(list)
    for i in os.listdir(direc_prefix + site_d):
        if comment_pattern.match(i):
            files[find_month(int(day_finder.search(i).group(1)))].append(direc_prefix + site_d + i)
    for month in months:
        file_list = files[month]
        s = sentences(file_list)
        if prev_file is None:
            model = word2vec.Word2Vec(s, iter=10)
        else:
            model = gensim.models.Word2Vec.load(prev_file)
            model.train(s, iter=10, size=300, workers=10)
        prev_file = result_storage_direc + "{}2016{}.w2v".format(site, month)
        model.save(prev_file)  # assumed: the model must be saved here, since the next iteration loads prev_file
def dfs(source, prev=-1):
    # (header and child loop assumed; the excerpt begins mid-function)
    for child in graph[source]:
        if child != prev:
            parent[child] = source
            dfs(child, source)


def precompute():
    dfs(0)  # Considering 0 as the root of the tree.
    for i in range(lg + 1):
        for j in range(n):
            if i == 0:
                table[i][j] = parent[j]
                continue
            table[i][j] = table[i - 1][table[i - 1][j]]


n, m = map(int, input().split())  # The number of nodes and number of edges
graph = dd(set)
for i in range(m):
    u, v = map(int, input().split())
    graph[u].add(v)
    graph[v].add(u)

lg = int(log(n, 2))
# To pre compute the parents of nodes at a distance of powers of two.
parent = dd(lambda: -1)
table = dd(lambda: dd(lambda: -1))
precompute()

node, ancestor = map(int, input().split())  # Enter the node and i-th ancestor we want to find
for i in range(lg + 1):
    if (ancestor >> i) & 1:  # test bit i; the original tested the same bit on every pass
        node = table[i][node]
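A small worked example of the lifting table (values illustrative): on the path 0-1-2-3 rooted at 0, table[0][j] holds j's parent and table[i][j] = table[i-1][table[i-1][j]] holds the 2^i-th ancestor, so the 2nd ancestor of node 3 is found in a single jump.

# parent: {1: 0, 2: 1, 3: 2} after dfs(0)
# table[0][3] = 2                                  (2^0 = 1st ancestor)
# table[1][3] = table[0][table[0][3]] = 1          (2^1 = 2nd ancestor)
# Query (node=3, ancestor=2): binary 10 -> only bit 1 set -> node = table[1][3] = 1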
def make_dictionary(self, question_dir, vocab_file, freq_file):
    if os.path.exists(vocab_file) and os.path.exists(freq_file):
        print "loading vocabularies from " + vocab_file + " ..."
        vocabularies = map(lambda x: x.strip(), open(vocab_file).readlines())
        word2freq = cPickle.load(open(freq_file))
    else:
        print "no " + vocab_file + " found, constructing the vocabulary list ..."
        fnames = []
        fnames += glob.glob(question_dir + "/test/*.question")
        fnames += glob.glob(question_dir + "/validation/*.question")
        fnames += glob.glob(question_dir + "/training/*.question")
        vocab_set = set()
        n = 0.
        word2freq = dd(int)
        for fname in fnames:
            fp = open(fname)
            fp.readline()
            fp.readline()
            document = fp.readline().split()
            fp.readline()
            query = fp.readline().split()
            fp.close()
            vocab_set |= set(document) | set(query)
            for word in document:
                word2freq[word] += 1
            for word in query:
                word2freq[word] += 1
            word2freq[SYMB_BEGIN] += 2
            word2freq[SYMB_END] += 2
            # show progress
            n += 1
            if n % 10000 == 0:
                print '%3d%%' % int(100 * n / len(fnames))
        entities = set(e for e in vocab_set if e.startswith('@entity'))
        # @placeholder, @begin and @end are included in the vocabulary list
        tokens = vocab_set.difference(entities)
        tokens.add(SYMB_BEGIN)
        tokens.add(SYMB_END)
        vocabularies = list(entities) + list(tokens)
        print "writing vocabularies to " + vocab_file + " ..."
        vocab_fp = open(vocab_file, "w")
        vocab_fp.write('\n'.join(vocabularies))
        vocab_fp.close()
        freqs = [v for k, v in word2freq.iteritems()]
        freqs.sort()
        freq2index = {}
        bin_size = len(freqs) / BIN_NUM + 1
        for i, start in enumerate(range(0, len(freqs), bin_size)):
            end = min(start + bin_size, len(freqs))
            for j in range(start, end):
                freq2index[freqs[j]] = i
        for k in word2freq.keys():
            word2freq[k] = freq2index[word2freq[k]]
        cPickle.dump(word2freq, open(freq_file, 'w'), cPickle.HIGHEST_PROTOCOL)
    vocab_size = len(vocabularies)
    word_dictionary = dict(zip(vocabularies, range(vocab_size)))
    char_set = set([c for w in vocabularies for c in list(w)])
    char_set.add(' ')
    char_dictionary = dict(zip(list(char_set), range(len(char_set))))
    num_entities = len([v for v in vocabularies if v.startswith('@entity')])
    print "vocab_size = %d" % vocab_size
    print "num characters = %d" % len(char_set)
    print "%d anonymized entities" % num_entities
    print "%d other tokens (including @placeholder, %s and %s)" % (
        vocab_size - num_entities, SYMB_BEGIN, SYMB_END)
    return word_dictionary, char_dictionary, num_entities, word2freq
def count_tagsets(f, delimiter="\t", gold_analysis_in_the_first_position=False, verbose=False):
    tagsets_dict = dd(int)
    root_and_analysis_cooccurence = {}
    surface_form_and_gold_analysis_cooccurence = {}
    ambiguity_scores = []

    def record(key, key2, d):
        if key in d:
            d[key][key2] += 1
        else:
            d[key] = dd(int)
            d[key][key2] = 1  # the count belongs in the nested dict, not the outer one

    def record_root_and_analysis_cooccurence(root, analysis):
        record(root, analysis, root_and_analysis_cooccurence)

    def record_surface_form_and_gold_analysis_cooccurence(surface_form, analysis):
        record(surface_form, analysis, surface_form_and_gold_analysis_cooccurence)

    current_tagset = []
    current_roots = []
    analyses_idx = 0
    sentence_length = 0
    line = f.readline()
    # print line
    while line:
        line = line.strip()
        tokens = line.split(delimiter)
        # print tokens
        if len(tokens) == 3:
            if gold_analysis_in_the_first_position and analyses_idx == 0:
                record_surface_form_and_gold_analysis_cooccurence(tokens[0], tokens[2])
            if analyses_idx == 0 and verbose:
                print("SURFACE FORM: %s" % tokens[0])
            current_tagset += [tokens[2]]
            current_roots += [tokens[1]]
            record_root_and_analysis_cooccurence(tokens[1], tokens[2])
            analyses_idx += 1
            if tokens[0] in ["<S>", "<DOC>", "<TITLE>", "</DOC>", "</TITLE>"]:
                sentence_length = 0
                current_product_of_ambiguities = 1
            elif tokens[0] == "</S>":
                ambiguity_score = current_product_of_ambiguities / float(sentence_length) \
                    if sentence_length != 0 else 0.0
                ambiguity_scores.append([ambiguity_score, sentence_length])
        elif len(tokens) == 1:
            # tagset ended
            if len(current_tagset) > 0:
                tree_root = TreeNode(None, "ROOT")
                root_to_anonymized_root = {root: ("X%d" % (idx + 1))
                                           for idx, root in enumerate(sorted(set(current_roots)))}
                sorted_tagset = sorted(zip([root_to_anonymized_root[root] for root in current_roots],
                                           current_tagset),
                                       key=lambda x: x[1])
                tagsets_dict["\n".join([x + y for x, y in sorted_tagset])] += 1
                current_product_of_ambiguities *= len(current_tagset)
                # trees
                for tagset_as_seq in [(x + y).split("+") for x, y in sorted_tagset]:
                    insert_into_tree(tree_root, tagset_as_seq)
                if verbose:
                    unanonymized_sorted_tagset = sorted(zip(current_roots, current_tagset),
                                                        key=lambda x: x[1])
                    print(unanonymized_sorted_tagset)
                    print(sorted_tagset)
                    tr = LeftAligned()
                    print(tr(tree_root.print_children_recursive()))
            # clear
            current_tagset = []
            current_roots = []
            analyses_idx = 0
            sentence_length += 1
        elif len(tokens) == 2:
            # <DOC> or <TITLE> OR <S> OR </S>
            pass
        line = f.readline()
    return tagsets_dict, root_and_analysis_cooccurence, surface_form_and_gold_analysis_cooccurence, ambiguity_scores
def __init__(self, options, data):
    self.options = options
    self.names = [f.name for f in data.features]
    self.sample_names = [s.name for s in data.samples]
    self.marker_stats = dd(lambda: dd(bool))
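# Quick illustration of the marker_stats pattern above: a nested defaultdict
# of bool reads as False for any (feature, sample) pair never written to.
from collections import defaultdict as dd

marker_stats = dd(lambda: dd(bool))
marker_stats["geneA"]["sample1"] = True
print(marker_stats["geneA"]["sample1"], marker_stats["geneB"]["sample9"])  # True False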
import pickle
from collections import defaultdict as dd


class LabelToSeed:
    def __init__(self):
        # instance-level state so separate objects do not share one dict/list
        self.seeds = dd(list)
        self.tweets = []
        self.labels = []

    def find_emo_cause(self, tweet_file, label_file):
        """
        Find the emotion causes
        :param tweet_file: tokenized tweet file
        :param label_file: labeled tweet file
        :return: void
        """
        with open(tweet_file, 'r') as t:
            for words in t:
                words = words.split()
                words = [w.split(":")[0] for w in words]
                self.tweets.append(words)
        with open(label_file, 'r') as l:
            for tags in l:
                tags = tags.split()
                self.labels.append(tags)
        for idx in range(len(self.labels)):
            self.extract_emo_cause(self.labels[idx], self.tweets[idx])
        self.pickle_seeds()

    def extract_emo_cause(self, labels, words):
        """
        Extract the emotions and causes
        :param labels: list of labels
        :param words: list of words
        :return: void
        """
        emo_flag = False
        cause_flag = False
        emo = []
        cause = []
        if len(labels) != len(words):
            raise ValueError("Tweet tokens and labels do not match")
        for idx, label in enumerate(labels):
            if label == "I-E":
                emo.append(words[idx])
            elif label == "I-C":
                cause.append(words[idx])
            elif label == "O":
                if emo_flag:
                    emo_flag = False
                    if cause:
                        self.seeds[" ".join(emo)].append(" ".join(cause))
                    emo = []
                    cause = []
                elif cause_flag:
                    cause_flag = False
                    if emo:
                        self.seeds[" ".join(emo)].append(" ".join(cause))
                    emo = []
                    cause = []
            elif label == "B-E":
                if cause_flag:
                    cause_flag = False
                    if emo:
                        self.seeds[" ".join(emo)].append(" ".join(cause))
                    emo = []
                    cause = []
                emo.append(words[idx])
                emo_flag = True
            elif label == "B-C":
                if emo_flag:
                    emo_flag = False
                    if cause:
                        self.seeds[" ".join(emo)].append(" ".join(cause))
                    emo = []
                    cause = []
                cause.append(words[idx])
                cause_flag = True
            else:
                raise ValueError("Unknown label encountered: " + label)
        # When BIO tags occur at end of sentence
        if emo_flag or cause_flag:
            if emo and cause:
                self.seeds[" ".join(emo)].append(" ".join(cause))

    def pickle_seeds(self):
        """
        Pickle the seed data
        :return: void
        """
        with open('../../lib/seeds/train_seeds.pkl', 'wb') as fh:
            pickle.dump(self.seeds, fh)
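# Illustrative usage (file paths are assumptions): pair a tokenized tweet file
# with its BIO-labelled counterpart and dump the emotion -> causes seeds.
lts = LabelToSeed()
lts.find_emo_cause("tweets.tok", "tweets.bio")
print(dict(lts.seeds))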
def logitR(self, target, target_names, Y, F, left_out=5):
    logit = LogisticRegression(penalty='l1')
    id_Ys = [[y[i] for y in Y] for i in range(len(target))]
    coef_key = dd(lambda: dd(list))
    out = rage_classify_outputs.Classifier_Output(self.options)
    TRAIN_IDXS, CAND_IDXS, TEST_IDXS, s_key, p_key = [], [], [], {}, dd(list)
    sample_grades, gene_grades = {}, dd(lambda: dd(list))
    for i in range(len(target)):
        sample_grades[self.sample_names[i]] = dd(list)
        if type(target[i]) == int:
            s_key[i] = [self.sample_names[i], True, target_names[target[i]]]
            TRAIN_IDXS.append(i)
            CAND_IDXS.append(i)
        else:
            s_key[i] = [self.sample_names[i], False, target[i]]
            TEST_IDXS.append(i)
    while len(CAND_IDXS) > 0:
        if len(CAND_IDXS) > left_out:
            test_set = list([x for x in np.random.choice(CAND_IDXS, left_out, replace=False)])
        else:
            test_set = CAND_IDXS
        train_set = [i for i in TRAIN_IDXS if i not in test_set]
        train_opts = [target_names[target[t]] for t in train_set]
        if len(list(set(train_opts))) != len(target_names):
            # a class is missing from this split; shrink the held-out set and retry
            print 'warning: class missing from training split, reducing left_out'
            left_out -= 1
            continue
        Yj, Tj = [id_Ys[i] for i in train_set], [target[i] for i in train_set]
        logit.fit(Yj, Tj)
        p_coeffs = logit.coef_
        preds = logit.predict(id_Ys)
        probs = logit.predict_proba(id_Ys)
        for i in range(len(p_coeffs)):
            for k in range(len(p_coeffs[i])):
                coef_key[k][i].append(p_coeffs[i][k])
        for j in TEST_IDXS + test_set:
            pr = sorted(probs[j], reverse=True)
            p_key[j].append((target_names[preds[j]], pr[0] - pr[1]))
            sName, sBool, sTrue = s_key[j]
            sPredict = target_names[preds[j]]
            if len(p_key[j]) > 0:
                for k, F_name in enumerate(F):
                    jVal = Y[k][j]
                    jMults = sorted({Xk: jVal * Xv[-1] for Xk, Xv in coef_key[k].items()}.items(),
                                    key=lambda XX: XX[1])
                    for cIdx, cVals in enumerate(p_coeffs):
                        if cVals[k] != 0:
                            cMult = cVals[k] * jVal
                            if jMults[-1][1] > 0:
                                # strongest positive product points toward this class
                                mTarget = target_names[jMults[-1][0]]
                                sample_grades[sName][mTarget].append([F_name, -1 * jMults[-1][1]])
                            if jMults[0][1] < 0:
                                # strongest negative product points away from this class
                                mTarget = target_names[jMults[0][0]]
                                sample_grades[sName][mTarget].append([F_name, -1 * jMults[0][1]])
                sOutcome = (sTrue, sPredict)
                if sTrue == sPredict:
                    sBoolOutcome = 'YES'
                else:
                    sBoolOutcome = 'NO'
                for t in target_names:
                    tPos = sorted([(f_val, f_name) for f_name, f_val in sample_grades[sName][t]
                                   if f_val > 0], reverse=True)
                    tNeg = sorted([(f_val, f_name) for f_name, f_val in sample_grades[sName][t]
                                   if f_val < 0], reverse=False)
                    if sTrue != t:
                        for fi, f_data in enumerate(tPos):
                            gene_grades[f_data[-1]]['FALSE_POS'].append((fi + 1, sBoolOutcome, sOutcome))
                        for fi, f_data in enumerate(tNeg):
                            gene_grades[f_data[-1]]['TRUE_NEG'].append((fi + 1, sBoolOutcome, sOutcome))
                    if sTrue == t:
                        for fi, f_data in enumerate(tPos):
                            gene_grades[f_data[-1]]['TRUE_POS'].append((fi + 1, sBoolOutcome, sOutcome))
                        for fi, f_data in enumerate(tNeg):
                            gene_grades[f_data[-1]]['FALSE_NEG'].append((fi + 1, sBoolOutcome, sOutcome))
            out.add_score(s_key[j], p_key[j])
            if j in CAND_IDXS:
                CAND_IDXS.remove(j)
            elif j in TEST_IDXS:
                TEST_IDXS.remove(j)
    for g in gene_grades:
        GK = {}
        for k in gene_grades[g].keys():
            k_data = gene_grades[g][k]
            rank_all = [kd[0] for kd in k_data]
            rank_yes = [kd[0] for kd in k_data if kd[1] == 'YES']
            rank_no = [kd[0] for kd in k_data if kd[1] == 'NO']
            ram, raL = np.mean(rank_all), len(rank_all)
            rym, ryL = np.mean(rank_yes), len(rank_yes)
            rnm, rnL = np.mean(rank_no), len(rank_no)
            GK[k] = [ram, raL, rym, ryL, rnm, rnL]
        out.add_gene_grades(g, GK)
    for k, f_name in enumerate(F):
        out.add_coefs(f_name, coef_key[k], target_names)
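# Isolated sketch of the sampling loop that drives logitR (toy indices): labelled
# candidates are held out in batches of left_out without replacement until every
# one has been scored once.
import numpy as np

cand, left_out, rounds = list(range(10)), 3, []
while cand:
    held = list(np.random.choice(cand, min(left_out, len(cand)), replace=False))
    rounds.append(held)
    cand = [i for i in cand if i not in held]
print(rounds)  # e.g. [[7, 2, 9], [0, 5, 3], [8, 1, 6], [4]]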
import sys
from collections import defaultdict as dd

sys.setrecursionlimit(1000000)

# Read an n-node tree as an adjacency list.
d = dd(list)
mod = 1000000007
n = int(input())
for _ in range(n - 1):
    a, b = map(int, input().split())
    d[a].append(b)
    d[b].append(a)
d = dict(d)
f = [None] * (n + 1)
g = [None] * (n + 1)


def dfs(p, x):
    # Post-order tree DP with parent p: g[x] is the product of f over x's
    # children, and f[x] = (product of g over children) + g[x], all mod 1e9+7.
    for i in d[x]:
        if i != p:
            dfs(x, i)
    f[x], g[x] = 1, 1
    for i in d[x]:
        if i != p:
            g[x] *= f[i]
            g[x] %= mod
            f[x] *= g[i]
            f[x] %= mod
    f[x] += g[x]
    f[x] %= mod
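# Assumed driver for the DP above (the call and output were not shown): solutions
# of this shape typically root the 1-indexed tree at node 1, using parent 0 as a
# sentinel, and report f at the root.
dfs(0, 1)
print(f[1])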
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from collections import defaultdict as dd

plik = open("wyniki_wyborow.tsv")
dane = list()  # city - seat count - results
for w in plik:
    w = list(w.split("\t"))
    dane.append(w[1:-1])
komitety = dane[0][2:]  # party names
dane = dane[1:]
wyniki = dd(list)  # (city, alfa) -> list of [score, party]
for lista in dane:
    miasto = lista[0]
    for dzielnik in range(1, int(lista[1])):
        alfa = 0.1
        while alfa < 2:
            x = dzielnik ** alfa
            for i in range(len(lista[2:])):
                partia = komitety[i]
                procent = lista[i + 2]
                if ',' in procent:
                    procent = procent.replace(',', '.')
                procent = float(procent)
                wyniki[(miasto, alfa)].append([procent / x, partia])
            alfa += 0.1  # step alfa, otherwise the while loop never terminates
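# Self-contained sketch of the highest-averages idea the loop above generalizes
# (vote shares invented): each share is divided by divisor**alfa and seats go to
# the largest quotients; alfa = 1.0 is the classic d'Hondt rule.
shares = {"A": 48.0, "B": 32.0, "C": 20.0}
seats, alfa = 5, 1.0
quotients = [(v / (k ** alfa), p) for p, v in shares.items() for k in range(1, seats + 1)]
winners = [p for _, p in sorted(quotients, reverse=True)[:seats]]
print(winners)  # ['A', 'B', 'A', 'C', 'B']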
def logitU(self, target, target_names, Y, F, left_out=5):
    logit = LogisticRegression(penalty='l1')
    iterations = 5
    known_key, unk_key = {}, {}
    known_idxs, known_vals, unk_idxs, unk_vals = [], [], [], []
    valid_names, valid_target, valid_key = [], [], {}
    for i in range(len(target)):
        if target_names[target[i]].split('~')[-1].upper()[0:3] == 'UNK':
            unk_idxs.append(i)
            unk_vals.append([y[i] for y in Y])
            unk_key[len(unk_idxs) - 1] = [self.sample_names[i], target_names[target[i]]]
        else:
            known_idxs.append(i)
            known_vals.append([y[i] for y in Y])
            known_key[len(known_idxs) - 1] = [self.sample_names[i], target_names[target[i]]]
            if target[i] not in valid_key:
                valid_names.append(target_names[target[i]])
                valid_key[target[i]] = len(valid_names) - 1
            valid_target.append(valid_key[target[i]])
    novel_key = dd(list)
    iter_key = dd(list)
    left_idxs = np.random.choice(range(len(valid_target)), left_out, replace=False)
    while True:
        left_vals = [known_vals[i] for i in left_idxs]
        left_target = [valid_target[i] for i in left_idxs]
        iter_vals = [v for i, v in enumerate(known_vals) if i not in left_idxs]
        iter_target = [v for i, v in enumerate(valid_target) if i not in left_idxs]
        if len(list(set(iter_target))) < len(valid_names):
            left_out -= 1
            continue
        logit.fit(iter_vals, iter_target)
        p_coeffs = logit.coef_
        pred_unk = logit.predict(unk_vals)
        prob_unk = logit.predict_proba(unk_vals)
        pred_val = logit.predict(left_vals)
        prob_val = logit.predict_proba(left_vals)
        for i in range(len(unk_idxs)):
            novel_key[i].append((valid_names[pred_unk[i]], prob_unk[i][pred_unk[i]]))
        for i, j in enumerate(left_idxs):
            iter_key[j].append((valid_names[pred_val[i]], prob_val[i][pred_val[i]]))
        left_cands = [i for i in range(len(valid_target)) if len(iter_key[i]) < 4]
        if len(left_cands) == 0:
            break
        elif len(left_cands) <= left_out:
            left_idxs = left_cands
        else:
            left_idxs = np.random.choice(left_cands, left_out, replace=False)
    out = rage_classify_outputs.Classifier_Unknown_Output(self.options)
    for i, dubs in novel_key.items():
        votes = dd(float)
        name, orig_id = unk_key[i]
        for a, b in dubs:
            votes[a] += b
        scrs = sorted(votes.items(), key=lambda X: X[1], reverse=True)
        out.add_pred(name, orig_id, scrs[0][0], scrs[0][1] / sum([sc[1] for sc in scrs]))
    for i, dubs in iter_key.items():
        votes = dd(float)
        name, orig_id = known_key[i]
        for a, b in dubs:
            votes[a] += b
        scrs = sorted(votes.items(), key=lambda X: X[1], reverse=True)
        out.add_pred(name, orig_id, scrs[0][0], scrs[0][1] / sum([sc[1] for sc in scrs]))
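# Toy version of the vote aggregation at the end of logitU (pairs invented):
# per-iteration (label, probability) pairs are summed per label and the
# winner's mass is normalized into a confidence score.
from collections import defaultdict as dd

dubs = [("tumor", 0.9), ("normal", 0.6), ("tumor", 0.7)]
votes = dd(float)
for a, b in dubs:
    votes[a] += b
scrs = sorted(votes.items(), key=lambda X: X[1], reverse=True)
print(scrs[0][0], scrs[0][1] / sum(sc[1] for sc in scrs))  # tumor 0.7272...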
def parse_gene_ref(ref_gene):
    # some gene ids are not in txt form; provide the ability to parse BED
    path, ext = os.path.splitext(ref_gene)
    if ext.lower() == '.bed':
        reader = BEDFile(ref_gene)
    else:
        reader = KnownGeneFile(ref_gene)
    # all of the genes in a chromosome: keys = chromID, value = [list of genes]
    gene_ref = dd(list)
    # all information about each gene: keys = geneID, value = dictionary of all info about the gene
    gene_info = {}
    # per-chromosome AVL tree: keys = chromID, value = AVL(all genes arranged by start site)
    chrom_info = {}
    for ref_dict in reader:
        if ext.lower() == '.bed':
            ref_dict['txStart'] = ref_dict['chromStart']
            ref_dict['txEnd'] = ref_dict['chromEnd']
        # determine intervals for promoter, gene, and downstream
        if ref_dict['strand'] == '+':  # gene in 5' to 3' orientation
            # start and end of the promoter
            promoter_coords = max(ref_dict['txStart'] - 1 - opts.upst_win, 0), ref_dict['txStart'] - 1
            # start and end of the gene
            gene_coords = ref_dict['txStart'], ref_dict['txEnd']
            # use these coordinates if we're trying to window around the TSS
            window_coords = ref_dict['txStart'] + 1, ref_dict['txStart'] + opts.dnst_win
            downstream_coords = ref_dict['txEnd'] + 1, ref_dict['txEnd'] + 1 + opts.dnst_win
        else:
            # +1 because we're using 1-based indexing
            promoter_coords = ref_dict['txEnd'] + 1, ref_dict['txEnd'] + 1 + opts.upst_win
            gene_coords = ref_dict['txStart'], ref_dict['txEnd']
            window_coords = ref_dict['txEnd'] - opts.dnst_win, ref_dict['txEnd']
            # -1 because we're using 1-based indexing
            downstream_coords = ref_dict['txStart'] - 1 - opts.dnst_win, ref_dict['txStart'] - 1
        ref_dict['promoter_coords'] = promoter_coords
        ref_dict['gene_coords'] = gene_coords
        ref_dict['window_coords'] = window_coords
        ref_dict['downstream_coords'] = downstream_coords
        gene_ref[ref_dict['chrom']].append(ref_dict)
        gene_info[ref_dict['name']] = ref_dict
        # put relevant information about the gene into the AVL tree for its chromosome
        if ref_dict['chrom'] not in chrom_info:
            chrom_info[ref_dict['chrom']] = avl.AVLTree()  # new instance of an AVL tree
        if ref_dict['strand'] == '+':
            chrom_info[ref_dict['chrom']].insert(
                (ref_dict['promoter_coords'][0], ref_dict['downstream_coords'][1], ref_dict['name']))
        else:
            chrom_info[ref_dict['chrom']].insert(
                (ref_dict['downstream_coords'][0], ref_dict['promoter_coords'][1], ref_dict['name']))
    return gene_ref, gene_info, chrom_info
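# Hypothetical call (the file name is an assumption): a UCSC-style knownGene
# table or a BED file both yield the same three structures.
gene_ref, gene_info, chrom_info = parse_gene_ref("knownGene.txt")
print("%d genes across %d chromosomes" % (len(gene_info), len(gene_ref)))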
                type=str,
                help='Column name for gene id in the other file')
ap.add_argument('--other_disease',
                required=True,
                type=str,
                help='Column name for disease id in the other file')
ap.add_argument('--pheno_series',
                required=True,
                type=str,
                help='A json dictionary containing mappings between disease mim IDs and their phenotypic series number')
args = ap.parse_args()

omim_dict = dd(set)
with open(args.omim) as omim_fh:
    header = omim_fh.readline().strip().split("\t")
    for line in omim_fh:
        lines = line.strip().split("\t")
        # dictionary indexed by gene id, containing a set of disease identifiers as values
        omim_dict[lines[header.index(args.omim_gene)]].add(
            lines[header.index(args.omim_disease)])

with open(args.pheno_series, 'r') as ps_h:
    ps = json.load(ps_h)

with open(args.other) as other_fh:
    header = other_fh.readline().strip().split("\t")
    print("\t".join(header))
    for line in other_fh:
    peak_info[peak['name']] = peak
    if peak['chrom'] not in chrom_peaks:
        chrom_peaks[peak['chrom']] = [(peak[start_field], peak['name'])]
    else:
        chrom_peaks[peak['chrom']].append((peak[start_field], peak['name']))

peaks_writer = DictWriter(peak_output, output_fields, delimiter='\t',
                          extrasaction='ignore', lineterminator='\n')
peaks_writer.writerow(dict([(k, k) for k in output_fields]))
unique_genes = set()
map_stats = dd(int)
rowcount = 0
interval = 1000
if totalrows > 100000:
    interval = 10000
print '\nParsing %d rows from peak file; will provide an update every %d rows' % (
    totalrows, interval)
peaks_without_genes = []
genes_without_peaks = []
# walk through the peaks in a chromosome
for chrom in chrom_peaks:
    heapq.heapify(chrom_peaks[chrom])  # sort them based on order on the chromosome
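# Minimal sketch of the per-chromosome ordering step above (peaks invented):
# (start, name) tuples are heapified so peaks pop in chromosome order.
import heapq

peaks = [(500, "p2"), (100, "p1"), (900, "p3")]
heapq.heapify(peaks)
print([heapq.heappop(peaks)[1] for _ in range(3)])  # ['p1', 'p2', 'p3']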
# Print experiment configurations.
keys = list(args.__dict__.keys())
keys.sort()
strings = "=" * 64 + "\n" + "\n".join(
    [k + "=" + str(args.__dict__[k]) for k in keys]) + "\n" + "_" * 64
print(strings)
with open(os.path.join(result_dir, "log.txt"), "a") as f:
    f.write(strings + "\n")

# Load the training and test sets of the Drug-Disease Relations dataset (DDR).
X_train, X_test, X_elmo_train, X_elmo_test, \
    X_sparse_train, X_sparse_test, y_train, \
    y_test, feat2idx, vocab = get_ddr_dataset(args)

# Record the ground truth and model predictions.
results = dd(dd)  # defaultdict-of-defaultdict so result fields can be added lazily
results["X_test"] = X_test
results["y_test"] = y_test

# Train the model.
rnn = model(vocab=vocab,
            model_name=args.model_name,
            max_iter=args.max_iter,
            eta=args.eta,
            batch_size=args.batch_size,
            test_batch_size=args.test_batch_size,
            hid_dim=args.hid_dim,
            hid_hb_dim=args.hid_hb_dim,
            emb_dim=args.emb_dim,
            feat_dim=len(feat2idx),
            max_len=args.max_len,